diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index 44f9094..039fed0 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str: replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK + replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK replace('&', '&')) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index c1a25d9..3e583df 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -4,26 +4,29 @@ import json from lxml import html import re from ADC_function import * +from WebCrawler.storyline import getStoryline def main(number: str) -> json: try: - carib_obj, browser = get_html_by_browser( - 'https://www.caribbeancom.com/moviepages/'+number+'/index.html', - return_type="browser") - - if not carib_obj or not carib_obj.ok: + # 因演员图片功能还未使用,为提速暂时注释,改为用get_html() + #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html', + # return_type='browser') + #if not r.ok: + # raise ValueError("page not found") + #htmlcode = str(browser.page) + htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content') + htmlcode = htmlbyte.decode('euc-jp') + if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode: raise ValueError("page not found") - lx = html.fromstring(str(browser.page)) - - if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): - raise ValueError("page info not found") + lx = html.fromstring(htmlcode) + title = get_title(lx) dic = { - 'title': get_title(lx), + 'title': title, 'studio': '加勒比', 'year': get_year(lx), - 'outline': get_outline(lx), + 'outline': get_outline(lx, number, title), 'runtime': get_runtime(lx), 'director': '', 'actor': get_actor(lx), @@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] -def get_outline(lx: html.HtmlElement) -> str: - return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() +def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: + o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() + + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g + return o def get_release(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 693f404..5c2b91a 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -23,11 +23,11 @@ class noThread(object): # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 -def getStoryline(number, title): +def getStoryline(number, title, sites: list=None): start_time = time.time() conf = config.getInstance() debug = conf.debug() or conf.storyline_show() == 2 - storyine_sites = conf.storyline_site().split(',') + storyine_sites = conf.storyline_site().split(',') if sites is None else sites apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count())