Files
ddys-client/DDYSClient/Services/HTMLParser.swift
2026-02-26 22:15:35 +08:00

377 lines
14 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Foundation
import SwiftSoup
/// Page position of a paginated listing, as produced by `HTMLParser.parsePagination(html:)`.
struct PaginationInfo {
    /// Number of the page currently shown.
    let current: Int

    /// Highest page number found for the listing.
    let total: Int
}
enum HTMLParser {
// MARK: - Content list
/// Parses the `.movie-card` elements of a listing page into content items.
///
/// A card is skipped when it has no link whose `href` starts with `/movie/`, or when
/// its slug or title turns out to be empty.
///
/// - Parameters:
///   - html: HTML of the listing page.
///   - defaultCategory: Category assigned when the card's meta line does not name one
///     of the recognised categories.
/// - Returns: One `ContentItem` per usable card, in document order.
/// - Throws: Errors raised by SwiftSoup while parsing or querying the document.
static func parseContentList(html: String, defaultCategory: ContentCategory = .movie) throws -> [ContentItem] {
    let moviePathPrefix = "/movie/"

    // First run of ASCII digits in `text` as an integer, or nil when there is none.
    // Used for the "在线"/"网盘" counters so the result does not depend on which colon
    // variant, spacing or trailing text the page puts around the number.
    func firstInteger(in text: String) -> Int? {
        let digits = text
            .drop(while: { !($0.isASCII && $0.isNumber) })
            .prefix(while: { $0.isASCII && $0.isNumber })
        return Int(digits)
    }

    let doc = try SwiftSoup.parse(html)
    let cards = try doc.select(".movie-card")
    var items: [ContentItem] = []

    for card in cards {
        guard let link = try card.select("a[href^=/movie/]").first() else { continue }
        let href = try link.attr("href")

        // The selector guarantees the prefix, so drop exactly that prefix and keep the
        // path only: a query string, fragment or trailing slash must not leak into the id.
        let slugPath = href
            .dropFirst(moviePathPrefix.count)
            .prefix(while: { $0 != "?" && $0 != "#" })
        let slug = slugPath.trimmingCharacters(in: CharacterSet(charactersIn: "/"))
        guard !slug.isEmpty else { continue }

        let title = try card.select("h3 a").text().trimmingCharacters(in: .whitespacesAndNewlines)
        guard !title.isEmpty else { continue }

        let imgSrc = try card.select("img").attr("src")
        let posterURL = URL(string: imgSrc)

        // A non-numeric badge leaves the rating nil.
        let ratingText = try card.select(".badge-top-right").text().trimmingCharacters(in: .whitespacesAndNewlines)
        let rating = Double(ratingText)

        var badges: [String] = []
        let topLeftBadge = try card.select(".badge-top-left").text().trimmingCharacters(in: .whitespacesAndNewlines)
        if !topLeftBadge.isEmpty { badges.append(topLeftBadge) }
        let bottomRightBadge = try card.select(".badge-bottom-right").text().trimmingCharacters(in: .whitespacesAndNewlines)
        if !bottomRightBadge.isEmpty { badges.append(bottomRightBadge) }

        // Meta line of the form "<year> · <category>".
        let metaDiv = try card.select(".p-4 .text-xs.font-light, .p-4 .text-xs.text-gray-500").first()
        let metaText = try metaDiv?.text().trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
        let metaParts = metaText.split(separator: "·").map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }

        var year = 0
        if let firstPart = metaParts.first, let parsedYear = Int(firstPart) {
            year = parsedYear
        }

        var category = defaultCategory
        if metaParts.count > 1 {
            switch metaParts[1] {
            case "电影": category = .movie
            case "剧集", "电视剧": category = .series
            case "综艺": category = .variety
            case "动漫", "动画": category = .anime
            default: break // unknown label: keep the caller-supplied default
            }
        }

        // Resource counters, one span each, labelled "在线" (online) and "网盘" (netdisk).
        var onlineCount = 0
        var netdiskCount = 0
        let spans = try card.select(".flex.items-center.gap-3 span")
        for span in spans {
            let spanText = try span.text()
            if spanText.contains("在线") {
                onlineCount = firstInteger(in: spanText) ?? 0
            } else if spanText.contains("网盘") {
                netdiskCount = firstInteger(in: spanText) ?? 0
            }
        }

        items.append(ContentItem(
            id: slug,
            title: title,
            year: year,
            category: category,
            rating: rating,
            posterURL: posterURL,
            badges: badges,
            onlineCount: onlineCount,
            netdiskCount: netdiskCount,
            detailURL: href
        ))
    }
    return items
}
// MARK: - Content detail
/// Parses a detail page into a `ContentDetail`.
///
/// Metadata is read from the first `application/ld+json` script when present; every
/// field that is still missing afterwards is filled from the HTML itself (rating display,
/// "year · region · genres" meta line, labelled director/actor rows, `.prose` paragraphs).
///
/// The returned item always has category `.movie`, no badges and zero resource counts.
/// `episodes` is set only when the first source has more than one episode.
///
/// - Parameter html: HTML of the detail page.
/// - Returns: The parsed detail, including its stream sources.
/// - Throws: Errors raised by SwiftSoup while parsing or querying the document.
static func parseContentDetail(html: String) throws -> ContentDetail {
    let doc = try SwiftSoup.parse(html)

    // Splits "A / B / C" into trimmed, non-empty entries.
    func slashSeparated(_ text: String) -> [String] {
        return text.split(separator: "/")
            .map { $0.trimmingCharacters(in: .whitespaces) }
            .filter { !$0.isEmpty }
    }

    // Names from a JSON-LD person field. JSON-LD allows either a single object or an
    // array of objects for the same property, so both shapes are accepted.
    func personNames(from value: Any?) -> [String] {
        if let people = value as? [[String: Any]] {
            return people.compactMap { $0["name"] as? String }
        }
        if let person = value as? [String: Any], let name = person["name"] as? String {
            return [name]
        }
        return []
    }

    // HTML fallback for a row shaped like `<div><span>label</span><span>A / B</span></div>`.
    // A wrapper around several rows also has text starting with the first row's label,
    // and it precedes that row in document order; taking it would return the LAST row's
    // names. So a div is only accepted when no nested div carries the label as well.
    func creditNames(labelledWith labels: [String]) throws -> [String] {
        func startsWithLabel(_ text: String) -> Bool {
            return labels.contains { text.hasPrefix($0) }
        }
        for div in try doc.select("div") {
            let text = try div.text()
            guard startsWithLabel(text) else { continue }

            let wrapsLabelledRow = try div.select("div").contains { nested in
                guard nested !== div else { return false }
                let nestedText = try nested.text()
                return startsWithLabel(nestedText)
            }
            if wrapsLabelledRow { continue }

            let spans = try div.select("span")
            guard spans.size() >= 2, let valueSpan = spans.last() else { return [] }
            let valueText = try valueSpan.text()
            return slashSeparated(valueText)
        }
        return []
    }

    // Title: combined text of the first <h1> (including any nested spans).
    let h1 = try doc.select("h1").first()
    let fullTitle = try h1?.text().trimmingCharacters(in: .whitespacesAndNewlines) ?? "未知标题"

    // Poster image.
    let imgSrc = try doc.select("img.w-full.h-full.object-cover").first()?.attr("src") ?? ""
    let posterURL = URL(string: imgSrc)

    // === Structured data (JSON-LD) ===
    var year = 0
    var rating: Double?
    var directors: [String] = []
    var actors: [String] = []
    var genres: [String] = []
    var description = ""
    var region = ""

    if let jsonLDScript = try doc.select("script[type=application/ld+json]").first() {
        let jsonText = try jsonLDScript.data()
        if let data = jsonText.data(using: .utf8),
           let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
            if let published = json["datePublished"] as? String {
                // ISO 8601 values ("2025-03-14", "2025-03-14T08:00:00Z") start with the
                // four-digit year; a bare "2025" still parses the same way as before.
                let leadingYear = published.trimmingCharacters(in: .whitespaces).prefix(4)
                year = Int(leadingYear) ?? 0
            } else if let published = json["datePublished"] as? NSNumber {
                year = published.intValue
            }

            if let agg = json["aggregateRating"] as? [String: Any] {
                // ratingValue may be serialised as text ("8.5") or as a JSON number (8.5).
                if let text = agg["ratingValue"] as? String {
                    rating = Double(text)
                } else if let number = agg["ratingValue"] as? NSNumber {
                    rating = number.doubleValue
                }
            }

            directors = personNames(from: json["director"])
            actors = personNames(from: json["actor"])

            if let genreArray = json["genre"] as? [String] {
                genres = genreArray
            } else if let genre = json["genre"] as? String, !genre.isEmpty {
                genres = [genre]
            }

            if let desc = json["description"] as? String {
                description = desc
            }
        }
    }

    // === HTML fallbacks ===

    // Rating: use the first numeric token only. Joining every token would turn
    // "8.5 / 10" into "8.510".
    if rating == nil {
        let ratingText = try doc.select(".rating-display").text()
        let nonNumeric = CharacterSet.decimalDigits.union(CharacterSet(charactersIn: ".")).inverted
        if let token = ratingText.components(separatedBy: nonNumeric).first(where: { !$0.isEmpty }) {
            rating = Double(token)
        }
    }

    // Meta line of the form "<year> · <region> · <genre> / <genre>"; only the first
    // element containing "·" is considered.
    let metaDivs = try doc.select(".text-xs.text-gray-600.font-light, .text-sm.text-gray-600.font-light")
    for metaDiv in metaDivs {
        let text = try metaDiv.text().trimmingCharacters(in: .whitespacesAndNewlines)
        guard text.contains("·") else { continue }

        let parts = text.split(separator: "·").map { $0.trimmingCharacters(in: .whitespaces) }
        if year == 0, let firstPart = parts.first, let parsedYear = Int(firstPart) {
            year = parsedYear
        }
        if parts.count >= 2 && region.isEmpty {
            region = parts[1]
        }
        if parts.count >= 3 && genres.isEmpty {
            genres = slashSeparated(parts[2])
        }
        break
    }

    // Director / cast rows. Both the full-width colon (U+FF1A) and the ASCII colon are
    // accepted; the full-width one is written as an escape because the two glyphs are
    // indistinguishable in most fonts.
    if directors.isEmpty {
        directors = try creditNames(labelledWith: ["导演\u{FF1A}", "导演:"])
    }
    if actors.isEmpty {
        actors = try creditNames(labelledWith: ["主演\u{FF1A}", "主演:"])
    }

    // Synopsis: paragraphs inside `.prose`, separated by blank lines.
    if description.isEmpty {
        let prosePs = try doc.select(".prose p")
        let texts = try prosePs.map { try $0.text() }
        description = texts.joined(separator: "\n\n")
    }

    // === Stream sources ===
    let sources = try parseSourceTabs(doc: doc)

    // Slug: last path component of the canonical URL, falling back to the title.
    let canonicalHref = try doc.select("link[rel=canonical]").attr("href")
    let slug: String
    if !canonicalHref.isEmpty {
        slug = String(canonicalHref.split(separator: "/").last ?? Substring(fullTitle))
    } else {
        slug = fullTitle
    }

    let contentItem = ContentItem(
        id: slug,
        title: fullTitle,
        year: year,
        category: .movie,
        rating: rating,
        posterURL: posterURL,
        badges: [],
        onlineCount: 0,
        netdiskCount: 0,
        detailURL: "/movie/\(slug)"
    )

    // Expose an episode list only when the first source really has several entries.
    let firstSourceEpisodes = sources.first?.episodes
    return ContentDetail(
        item: contentItem,
        description: description,
        directors: directors,
        actors: actors,
        genres: genres,
        region: region,
        sources: sources,
        episodes: (firstSourceEpisodes?.count ?? 0) > 1 ? firstSourceEpisodes : nil
    )
}
// MARK: - Stream sources
/// Collects the stream sources offered by a detail page.
///
/// Sources are read from `<button onclick="switchSource(...)">` elements first. When no
/// button yields a source, the `<script>` elements are scanned for `switchSource(...)`
/// calls instead, and the first script that contains any supplies all sources.
///
/// - Parameter doc: Parsed detail page.
/// - Returns: Sources in document order; empty when none are found.
/// - Throws: Errors raised by SwiftSoup while querying the document.
private static func parseSourceTabs(doc: Document) throws -> [StreamSource] {
    // Display label for a stream format: "m3u8" is shown as "HLS", anything else upper-cased.
    func qualityLabel(for format: String) -> String {
        return format == "m3u8" ? "HLS" : format.uppercased()
    }

    // Text of the first capture group in `groups` that took part in `match`.
    func captured(_ match: NSTextCheckingResult, in text: String, anyOf groups: [Int]) -> String? {
        for group in groups {
            if let range = Range(match.range(at: group), in: text) {
                return String(text[range])
            }
        }
        return nil
    }

    var sources: [StreamSource] = []

    // Preferred: source buttons carrying the call in their onclick attribute.
    let buttons = try doc.select("button[onclick^=switchSource]")
    for (index, button) in buttons.enumerated() {
        let onclick = try button.attr("onclick")
        let name = try button.text().trimmingCharacters(in: .whitespacesAndNewlines)
        guard let parsed = parseSwitchSource(onclick) else { continue }
        sources.append(StreamSource(
            id: parsed.id,
            name: name.isEmpty ? "播放源 \(index + 1)" : name,
            quality: qualityLabel(for: parsed.format),
            episodes: parseEpisodes(urlString: parsed.url)
        ))
    }
    guard sources.isEmpty else { return sources }

    // Fallback: calls embedded in inline scripts. Script code may quote its string
    // arguments with either ' or " and pad them with whitespace, so both are accepted.
    // Groups: 1 = id, 2/3 = url ('…' / "…"), 4/5 = format ('…' / "…").
    let scripts = try doc.select("script")
    let pattern = #"switchSource\(\s*(\d+)\s*,\s*(?:'([^']*)'|"([^"]*)")\s*,\s*(?:'([^']*)'|"([^"]*)")\s*\)"#
    // Compiled once for all scripts; the pattern is constant.
    guard let regex = try? NSRegularExpression(pattern: pattern) else { return sources }

    for script in scripts {
        let content = try script.data()
        let matches = regex.matches(in: content, range: NSRange(content.startIndex..., in: content))
        for (index, match) in matches.enumerated() {
            guard let idText = captured(match, in: content, anyOf: [1]),
                  let url = captured(match, in: content, anyOf: [2, 3]),
                  let format = captured(match, in: content, anyOf: [4, 5]) else { continue }
            sources.append(StreamSource(
                // An id too large for Int falls back to the match position.
                id: Int(idText) ?? index,
                name: "播放源 \(index + 1)",
                quality: qualityLabel(for: format),
                episodes: parseEpisodes(urlString: url)
            ))
        }
        if !sources.isEmpty { break }
    }
    return sources
}
/// Extracts the arguments of the first `switchSource(id, url, format)` call in `onclick`.
///
/// The two string arguments may be quoted with `'` or `"`, and whitespace around the
/// arguments is ignored.
///
/// - Parameter onclick: JavaScript snippet, e.g. the value of an `onclick` attribute.
/// - Returns: The numeric id, the URL string and the format, or `nil` when the snippet
///   contains no such call. An id too large for `Int` is reported as 0.
private static func parseSwitchSource(_ onclick: String) -> (id: Int, url: String, format: String)? {
    // Groups: 1 = id, 2/3 = url ('…' / "…"), 4/5 = format ('…' / "…").
    let pattern = #"switchSource\(\s*(\d+)\s*,\s*(?:'([^']*)'|"([^"]*)")\s*,\s*(?:'([^']*)'|"([^"]*)")\s*\)"#

    // Text of the first capture group in `groups` that took part in `match`; only one
    // alternative of each quoted argument participates, the other has no range.
    func captured(_ match: NSTextCheckingResult, anyOf groups: [Int]) -> String? {
        for group in groups {
            if let range = Range(match.range(at: group), in: onclick) {
                return String(onclick[range])
            }
        }
        return nil
    }

    guard let regex = try? NSRegularExpression(pattern: pattern),
          let match = regex.firstMatch(in: onclick, range: NSRange(onclick.startIndex..., in: onclick)),
          let idText = captured(match, anyOf: [1]),
          let url = captured(match, anyOf: [2, 3]),
          let format = captured(match, anyOf: [4, 5]) else { return nil }

    return (Int(idText) ?? 0, url, format)
}
// MARK: - Episodes
/// Splits a source URL string into episodes.
///
/// The string is treated as `#`-separated entries of the form `name$url`. Entries
/// without a `$` or with an empty URL are ignored. An entry with an empty name
/// (`$url`) is kept and labelled "播放". When no entry qualifies and the input is not
/// empty, the whole string becomes a single episode named "播放".
///
/// - Parameter urlString: Raw URL argument of a `switchSource` call.
/// - Returns: Episodes whose `id` is the position of their entry in the input; empty
///   only for an empty input.
static func parseEpisodes(urlString: String) -> [Episode] {
    let fallbackName = "播放"
    var episodes: [Episode] = []

    for (index, part) in urlString.split(separator: "#").enumerated() {
        guard part.contains("$") else { continue }

        // Keep empty fields: with the default split, "$url" collapses to a single
        // field and the entry would be lost (or, if it was the only one, re-emitted
        // below with the "$" still attached to the URL).
        let fields = part.split(separator: "$", maxSplits: 1, omittingEmptySubsequences: false)
        guard fields.count == 2, !fields[1].isEmpty else { continue }

        episodes.append(Episode(
            id: index,
            name: fields[0].isEmpty ? fallbackName : String(fields[0]),
            url: String(fields[1])
        ))
    }

    if episodes.isEmpty && !urlString.isEmpty {
        episodes.append(Episode(id: 0, name: fallbackName, url: urlString))
    }
    return episodes
}
// MARK: - Pagination
/// Reads the current and last page number from a listing page.
///
/// The current page comes from the first `.pagination-active` element and defaults to 1.
/// The total is the largest of: the current page, every numbered pagination button, and
/// the page number in the `.pagination-next` link (`/page/<n>`).
///
/// - Parameter html: HTML of the listing page.
/// - Returns: Current page and highest page number found.
/// - Throws: Errors raised by SwiftSoup while parsing or querying the document.
static func parsePagination(html: String) throws -> PaginationInfo {
    let doc = try SwiftSoup.parse(html)

    // Read the first active marker only. `Elements.text()` combines the text of ALL
    // matches, so a page with two pagination bars would yield "2 2" and fall back to 1.
    let activeText = try doc.select(".pagination-active").first()?.text() ?? ""
    let currentPage = Int(activeText.trimmingCharacters(in: .whitespacesAndNewlines)) ?? 1

    var maxPage = currentPage
    let allBtns = try doc.select(".pagination-btn.pagination-number")
    for btn in allBtns {
        let text = try btn.text().trimmingCharacters(in: .whitespacesAndNewlines)
        if let pageNum = Int(text) {
            maxPage = max(maxPage, pageNum)
        }
    }

    // The "next" link proves that at least that page exists, even when the numbered
    // buttons do not show it.
    let nextLink = try doc.select(".pagination-next").attr("href")
    if let range = nextLink.range(of: #"/page/(\d+)"#, options: .regularExpression) {
        let pageStr = nextLink[range].replacingOccurrences(of: "/page/", with: "")
        if let nextPage = Int(pageStr) {
            maxPage = max(maxPage, nextPage)
        }
    }

    return PaginationInfo(current: currentPage, total: maxPage)
}
// MARK: - Home sections
/// Splits the cards of the home page into sections.
///
/// All cards are parsed with `parseContentList(html:)`. With more than ten cards the
/// result is two sections (the first ten, then the rest); otherwise a single section
/// holding every card.
///
/// - Parameter html: HTML of the home page.
/// - Returns: One or two arrays of items, in document order.
/// - Throws: Whatever `parseContentList(html:)` throws.
static func parseHomeSections(html: String) throws -> [[ContentItem]] {
    let firstSectionSize = 10
    let cards = try parseContentList(html: html)

    guard cards.count > firstSectionSize else {
        return [cards]
    }

    let leading = Array(cards[..<firstSectionSize])
    let trailing = Array(cards[firstSectionSize...])
    return [leading, trailing]
}
}