def getOutline(number, title):  # fetch the storyline; sites are queried concurrently
    """Return the storyline for *number*, or '' when invoked via airav.py.

    The call stack is inspected for any frame whose source file is named
    ``airav.py``. Calls that originate there get an empty string straight
    away instead of an outline, so the same data is not scraped twice and
    processing is not slowed down. Every other caller is forwarded to
    ``getStoryline(number, title)``.
    """
    called_from_airav = any(
        os.path.basename(frame.filename) == 'airav.py'
        for frame in inspect.stack()
    )
    if called_from_airav:
        return ''
    return getStoryline(number, title)
def getStoryline_airavwiki(number, debug):
    """Look up the synopsis text for *number* on cn.airav.wiki.

    The site is searched for the number, the first search result whose
    image title contains the number is followed, and the detail page is
    verified (URL and the bracketed number in the page title) before the
    synopsis paragraph is returned.

    Args:
        number: The movie number to look up. It is matched literally and
            case-insensitively against the search results and detail page.
        debug: When truthy, the reason for a failed lookup is printed.

    Returns:
        The stripped synopsis text, or '' when any step of the lookup
        fails. This function is best-effort and never raises.
    """
    try:
        # Numbers shaped like "012345-678" are searched by their first six
        # digits only; everything else is searched as given.
        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
        # Escape the number so characters such as '.' or '+' are compared
        # literally instead of being interpreted as regex syntax.
        number_re = re.compile(re.escape(number), re.I)
        url = f'https://cn.airav.wiki/?search={kwd}'
        # NOTE(review): get_html_by_browser is a project helper; the returned
        # browser is used like a MechanicalSoup StatefulBrowser - confirm.
        result, browser = get_html_by_browser(url, return_type='browser')
        if not result.ok:
            raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
        candidates = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
        link = None
        for a in candidates:
            # Skip result entries without a titled image instead of aborting
            # the whole lookup on the first malformed entry.
            img = a.img
            title = (img.get('title') or '') if img is not None else ''
            if number_re.search(title):
                link = a
                break
        if link is None:
            raise ValueError("number not found")
        result = browser.follow_link(link)
        if not result.ok or not number_re.search(browser.url):
            raise ValueError("detail page not found")
        title_nodes = browser.page.select('head > title')
        if not title_nodes:
            raise ValueError("detail page has no title")
        title = title_nodes[0].text.strip()
        # The detail page title carries the number in square brackets.
        bracketed = re.search(r'\[(.*?)]', title)
        if bracketed is None:
            raise ValueError(f"detail page title has no [number], got ->[{title}]")
        detail_number = bracketed.group(1)
        if not number_re.search(detail_number):
            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
        synopsis = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p')
        if synopsis is None:
            raise ValueError("synopsis not found")
        return synopsis.text.strip()
    except Exception as e:
        # Deliberately broad: a failing storyline site must not break the
        # caller, it simply contributes no text.
        if debug:
            print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
    return ''
58avgo只能查无码或者流出破解马赛克的影片(此功能没使用)。 ; xcity和amazon是日语的,由于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询, ; 设置成不查询可大幅提高刮削速度。 ; site= -site=3:avno1 -censored_site=1:airav,4:xcity,5:amazon -uncensored_site=2:58avgo +site=1:airavwiki,4:avno1 +censored_site=2:airav,5:xcity,6:amazon +uncensored_site=3:58avgo ; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) run_mode=1 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因