def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
    """Return the storyline for a title, falling back to the page description.

    The description paragraph is read from the parsed detail page first, so
    an ``IndexError`` propagates when the page has no description node.
    ``getStoryline`` is then called with ``number`` and ``title`` only (no
    explicit site list); a non-empty result takes precedence over the
    page's own description.
    """
    description_nodes = lx.xpath(
        "//div[@class='movie-info section']/p[@itemprop='description']/text()"
    )
    page_outline = description_nodes[0].strip()
    storyline = getStoryline(number, title)
    return storyline if len(storyline) else page_outline
def getStoryline_58avgo(number, debug):
    """Fetch the storyline text for ``number`` from 58avgo.com.

    The site's search form is submitted, the search result whose heading
    contains ``number`` (case-insensitive) is followed, and the text of the
    ``#ContentPlaceHolder1_Label2`` element on the detail page is returned.

    Args:
        number: Movie number to look up.  It is matched literally, so any
            regex metacharacters in it have no special meaning.
        debug: When true, the reason for a failed lookup is printed.

    Returns:
        The stripped storyline text, or ``''`` when anything goes wrong
        (request failure, number not found, unexpected page layout).  The
        lookup is best-effort and never raises.
    """
    try:
        # Pick an entry URL at random so a single IP's requests do not look
        # too uniform in the site's httpd logs.
        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
            '', '?status=3', '?status=4', '?status=7', '?status=9',
            '?status=10', '?status=11', '?status=12',
            '?status=1&Sort=Playon', '?status=1&Sort=dateupload',
            '?status=1&Sort=dateproduce',
        ])
        # For numbers shaped like "123456-78" only the first six characters
        # are used as the search keyword.
        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
        result, browser = get_html_by_form(
            url,
            fields={'ctl00$TextBox_SearchKeyWord': kwd},
            return_type='browser')
        if not result.ok:
            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
        if f'searchresults.aspx?Search={kwd}' not in browser.url:
            raise ValueError("number not found")

        # Escape the number so characters such as '.' are matched literally.
        number_re = re.compile(re.escape(number), re.I)
        candidates = browser.page.select(
            'div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
        link = next(
            (a for a in candidates if number_re.search(a.h3.text.strip())),
            None)
        if link is None:
            raise ValueError("number not found")

        result = browser.follow_link(link)
        if not result.ok or 'playon.aspx' not in browser.url:
            raise ValueError("detail page not found")
        title = browser.page.select('head > title')[0].text.strip()
        # The detail page title carries the number inside square brackets.
        detail_number = str(re.findall(r'\[(.*?)]', title)[0])
        if not number_re.search(detail_number):
            raise ValueError(
                f"detail page number not match, got ->[{detail_number}]")
        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
    except Exception as e:
        # Deliberately broad: a failing storyline site must not abort the
        # caller, it simply yields no storyline.
        if debug:
            print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
    return ''
def storyline_censored_site(self) -> str:
    """Return the ``censored_site`` option of the ``[storyline]`` section.

    The value is a comma-separated list of storyline site names.  When the
    option cannot be read, the built-in default ``"airav,xcity,amazon"`` is
    returned instead.
    """
    try:
        return self.conf.get("storyline", "censored_site")
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        return "airav,xcity,amazon"

def storyline_uncensored_site(self) -> str:
    """Return the ``uncensored_site`` option of the ``[storyline]`` section.

    The value is a comma-separated list of storyline site names.  When the
    option cannot be read, the built-in default ``"58avgo"`` is returned
    instead.
    """
    try:
        return self.conf.get("storyline", "uncensored_site")
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        return "58avgo"
中文字幕影片后缀 # 判断是否无码 - if is_uncensored(number): - uncensored = 1 - else: - uncensored = 0 + uncensored = 1 if is_uncensored(number) else 0 if '流出' in filepath or 'uncensored' in filepath: diff --git a/number_parser.py b/number_parser.py index 212c2c0..4d4fe93 100755 --- a/number_parser.py +++ b/number_parser.py @@ -71,7 +71,7 @@ G_TAKE_NUM_RULES = { '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), - 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) + 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]) } def get_number_by_dict(filename: str) -> str: