diff --git a/ADC_function.py b/ADC_function.py
index e5afb4b..e43fe5f 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -85,12 +85,11 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:


 def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
     if isinstance(cookies, dict) and len(cookies):
         s = requests.Session()
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
-    else:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
     configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
@@ -109,12 +108,11 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
     if isinstance(cookies, dict) and len(cookies):
         s = requests.Session()
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
-    else:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
     configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py
index d9c54b2..6c13e5d 100755
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -416,7 +416,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number):
         print('[!]', err)


-if __name__ == '__main__':
+def main():
     version = '5.0.1'
     urllib3.disable_warnings() #Ignore http proxy warning
@@ -483,6 +483,7 @@ if __name__ == '__main__':
     count = 0
     count_all = str(len(movie_list))
     print('[+]Find', count_all, 'movies.')
+    print('[*]======================================================')
     stop_count = conf.stop_counter()
     if stop_count<1:
         stop_count = 999999
@@ -517,3 +518,8 @@ if __name__ == '__main__':
         input("Press enter key exit, you can check the error message before you exit...")

     sys.exit(0)
+
+import multiprocessing
+if __name__ == '__main__':
+    multiprocessing.freeze_support()
+    main()
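Note: moving the script body into main() and calling multiprocessing.freeze_support() behind the __name__ guard is what makes the new process pool in WebCrawler/storyline.py safe: under the spawn start method (Windows, frozen executables) every worker re-imports the main module, so unguarded top-level code would run once per child. A minimal sketch of the pattern (worker and task names are illustrative):

    import multiprocessing

    def work(x):
        return x * x  # stand-in for a real task

    def main():
        with multiprocessing.Pool() as pool:
            print(pool.map(work, range(4)))  # [0, 1, 4, 9]

    if __name__ == '__main__':
        multiprocessing.freeze_support()  # no-op except in a frozen Windows executable
        main()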
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index 46628cf..c9d53f3 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -6,6 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler.storyline import getStoryline
 import inspect

 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
@@ -91,33 +92,8 @@ def getOutline0(number): # plot synopsis; airav.wiki 404s, function kept for now
     except:
         pass
     return ''
-def getOutline(number): # fetch the plot synopsis from avno1.cc
-    try:
-        url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
-            secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
-            '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
-            ]) # pick the entry URL at random so one IP's requests don't look too uniform in the site's httpd logs
-        number_up = number.upper()
-        result, browser = get_html_by_form(url,
-            form_select='div.wrapper > div.header > div.search > form',
-            fields = {'kw' : number_up},
-            return_type = 'browser')
-        if not result.ok:
-            raise
-        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
-        page_number = title[title.rfind(' '):].upper()
-        if not number_up in page_number:
-            raise
-        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
-    except:
-        pass
-    try:
-        from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
-        detail_html, browser = open_by_browser(number)
-        return xcity_getOutline(detail_html)
-    except:
-        pass
-    return ''
+def getOutline(number, title): # plot synopsis: the sites are queried concurrently via multiprocessing
+    return getStoryline(number, title)
 def getSerise(htmlcode): # get series (revised)
     html = etree.fromstring(htmlcode, etree.HTMLParser()) # if the record has no director, the series sits in 6th position
@@ -156,11 +132,12 @@ def main_uncensored(number):
     htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
     if "404 Page Not Found" in htmlcode:
         raise Exception('404 page not found')
+    title = str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode))).replace(getNum(htmlcode) + '-', '')
     dic = {
-        'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
+        'title': title,
         'studio': getStudio(htmlcode),
         'year': getYear(htmlcode),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
@@ -189,11 +166,12 @@ def main(number):
     htmlcode = get_html('https://www.javbus.com/' + number)
     if "<title>404 Page Not Found" in htmlcode:
         raise Exception('404 page not found')
+    title = str(re.sub(r'\w+-\d+-', '', getTitle(htmlcode)))
     dic = {
-        'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
+        'title': title,
         'studio': getStudio(htmlcode),
         'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
@@ -225,7 +203,11 @@ def main(number):
     return js

 if __name__ == "__main__" :
-    #print(main('ADV-R0624')) # 404
+    config.G_conf_override['debug_mode:switch'] = True
+    print(main('ABP-888'))
+    print(main('ABP-960'))
+    # print(main('ADV-R0624')) # 404
+    # print(main('MMNT-010'))
     print(main('ipx-292'))
     print(main('CEMD-011'))
     print(main('CJOD-278'))
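Note: getOutline() now takes the parsed title in addition to the number, because the Amazon source searches by title while airav/avno1/xcity search by ID. A hypothetical call (values are made up):

    from WebCrawler.storyline import getStoryline

    number = 'ABP-888'                      # ID used by the airav/avno1/xcity lookups
    title = 'parsed title from getTitle()'  # cleaned title used by the amazon lookup
    outline = getStoryline(number, title)   # queries all configured sites concurrently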
diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index 185d96b..241de49 100755
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -1,13 +1,11 @@
 import sys
-
-from mechanicalsoup.stateful_browser import StatefulBrowser
 sys.path.append('../')
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 from ADC_function import *
-# import sys
+from mechanicalsoup.stateful_browser import StatefulBrowser
+from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -206,9 +204,8 @@ def getOutline0(number): # plot synopsis; airav.wiki 404s, function kept for now
     except:
         pass
     return ''
-def getOutline(number): # plot synopsis
-    from WebCrawler.javbus import getOutline as javbus_getOutline
-    return javbus_getOutline(number)
+def getOutline(number, title): # plot synopsis: the sites are queried concurrently via multiprocessing
+    return getStoryline(number, title)
 def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
     html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -309,7 +306,7 @@ def main(number):
         'actor': getActor(detail_page),
         'title': title,
         'studio': getStudio(detail_page),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
         'runtime': getRuntime(detail_page),
         'director': getDirector(detail_page),
         'release': getRelease(detail_page),
@@ -350,11 +347,13 @@ if __name__ == "__main__":
     # print(main('blacked.20.05.30'))
     # print(main('AGAV-042'))
     # print(main('BANK-022'))
-    print(main('070116-197'))
-    print(main('093021_539')) # no stills; studio is pacopacomama
-    print(main('FC2-2278260'))
-    print(main('FC2-735670'))
+    # print(main('070116-197'))
+    # print(main('093021_539')) # no stills; studio is pacopacomama
+    # print(main('FC2-2278260'))
+    # print(main('FC2-735670'))
     # print(main('FC2-1174949')) # not found
     print(main('MVSD-439'))
     # print(main('EHM0001')) # not found
-    print(main('FC2-2314275'))
+    # print(main('FC2-2314275'))
+    # print(main('EBOD-646'))
+    print(main('LOVE-262'))
diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py
new file mode 100644
index 0000000..11142fc
--- /dev/null
+++ b/WebCrawler/storyline.py
@@ -0,0 +1,270 @@
+import sys
+sys.path.append('../')
+import re
+import json
+from ADC_function import *
+from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
+from multiprocessing import Pool
+from difflib import SequenceMatcher
+from unicodedata import category
+
+G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"}
+
+
+# Fetch the plot synopsis: all sites in the list are queried at the same time; result priority runs front to back.
+def getStoryline(number, title):
+    start_time = time.time()
+    conf = config.getInstance()
+    debug = conf.debug() or conf.storyline_show() == 2
+    storyline_sites = conf.storyline_site().split(',')
+    apply_sites = [s for s in storyline_sites if s in G_registered_storyline_site]
+    mp_args = ((site, number, title, debug) for site in apply_sites)
+    # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/
+    with Pool() as proc_pool:
+        result = proc_pool.map(getStoryline_mp, mp_args)
+    if not debug and conf.storyline_show() == 0:
+        for value in result:
+            if isinstance(value, str) and len(value):
+                return value
+        return ''
+    # The debug summary below goes into the log; output printed inside pool processes does not — it only appears on stdout.
+    cnt = len(apply_sites)
+    s = f'[!]MP Storyline ran {cnt} processes in {time.time() - start_time:.3f}s total (including startup overhead), finished at {time.strftime("%H:%M:%S")}'
+    first = True
+    sel = ''
+    for i in range(cnt):
+        sl = len(result[i]) if isinstance(result[i], str) else 0
+        if sl and first:
+            s += f', [selected: {apply_sites[i]}, {sl} chars]'
+            first = False
+            sel = result[i]
+        elif sl:
+            s += f', {apply_sites[i]}: {sl} chars'
+        else:
+            s += f', {apply_sites[i]}: empty'
+    print(s)
+    return sel
+
+
+def getStoryline_mp(args):
+    return _getStoryline_mp(*args)
+
+
+# Note: print() from a child process is never written to the log; to debug a broken data source, watch stdout directly, and attach a screenshot of it when filing an issue.
+def _getStoryline_mp(site, number, title, debug):
+    start_time = time.time()
+    storyline = None
+    if not isinstance(site, str):
+        return storyline
+    elif site == "airav":
+        storyline = getStoryline_airav(number, debug)
+    elif site == "avno1":
+        storyline = getStoryline_avno1(number, debug)
+    elif site == "xcity":
+        storyline = getStoryline_xcity(number, debug)
+    elif site == "amazon":
+        storyline = getStoryline_amazon(title, number, debug)
+    if not debug:
+        return storyline
+    print("[!]MP process [{}] ran {:.3f}s, finished at {}, result: {}".format(
+        site,
+        time.time() - start_time,
+        time.strftime("%H:%M:%S"),
+        storyline if isinstance(storyline, str) and len(storyline) else '[empty]')
+    )
+    return storyline
+
+
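Note: getStoryline_mp() exists only because Pool.map() delivers each work item as a single argument, so the tuple has to be unpacked by hand. Pool.starmap() (Python 3.3+) unpacks argument tuples itself and would make the shim unnecessary; a sketch under that assumption:

    from multiprocessing import Pool

    def fetch(site, number, title, debug):
        return f'{site}:{number}'  # stand-in for _getStoryline_mp

    if __name__ == '__main__':
        mp_args = [(s, 'ABP-888', 'a title', False) for s in ('airav', 'avno1')]
        with Pool() as pool:
            print(pool.starmap(fetch, mp_args))  # ['airav:ABP-888', 'avno1:ABP-888']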
ValueError(f"get_html_by_browser('{url}') failed") + avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div') + if number_up not in avs.select_one('a > h3').text.upper(): + raise ValueError("number not found") + detail_url = avs.select_one('a')['href'] + res = browser.open_relative(detail_url) + if not res.ok: + raise ValueError(f"browser.open_relative('{detail_url}') failed") + t = browser.page.select_one('head > title').text + airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper() + if number.upper() != airav_number: + raise ValueError(f"page number ->[{airav_number}] not match") + desc = browser.page.select_one('li.introduction > span').text.strip() + return desc + except Exception as e: + if debug: + print(f"[-]MP getOutline_amazon Error: {e},number [{number}].") + pass + return None + + +def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 + try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 + number_up = number.upper() + result, browser = get_html_by_form(url, + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise ValueError(f"get_html_by_form('{url}','{number_up}') failed") + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise ValueError(f"page number ->[{page_number}] not match") + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 + try: + #xcity_number = number.replace('-','') + query_result, browser = get_html_by_form( + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), + fields = {'q' : xcity_number.lower()}, + return_type = 'browser') + if not query_result or not query_result.ok: + raise ValueError("page not found") + result = browser.follow_link(browser.links('avod\/detail')[0]) + if not result.ok: + raise ValueError("detail page not found") + return browser.page.select_one('h2.title-detail + p.lead').text.strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_amazon(q_title, number, debug): + if not isinstance(q_title, str) or not len(q_title): + return None + try: + amazon_cookie, _ = load_cookies('amazon.json') + cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None + url = "https://www.amazon.co.jp/s?k=" + q_title + res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser') + if not res.ok: + raise ValueError("get_html_by_browser() failed") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + html = etree.fromstring(str(browser.page), etree.HTMLParser()) + titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") + urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") + if not len(urls) or len(urls) != len(titles): + 
raise ValueError("titles not found") + idx = amazon_select_one(titles, q_title, number, debug) + if not isinstance(idx, int) or idx < 0: + raise ValueError("title and number not found") + furl = urls[idx] + r = browser.open_relative(furl) + if not r.ok: + raise ValueError("browser.open_relative()) failed.") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + + ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip() + ama_t = re.sub(r'審査番号:\d+', '', ama_t) + + if cookie is None: + # 自动创建的cookies文件放在搜索路径表的末端,最低优先级。有amazon.co.jp帐号的用户可以从浏览器导出cookie放在靠前搜索路径 + ama_save = Path.home() / ".local/share/avdc/amazon.json" + ama_save.parent.mkdir(parents=True, exist_ok=True) + ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') + + return ama_t + + except Exception as e: + if debug: + print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}') + pass + return None + +# 查货架中DVD和蓝光商品中标题相似度高的 +def amazon_select_one(a_titles, q_title, number, debug): + sel = -1 + ratio = 0 + que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A)) + for loc in range(len(a_titles)): + t = a_titles[loc] + if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 + ratio = 1.0 + sel = loc + save_t_ = t + break + if not re.search('DVD|Blu-ray', t, re.I): + continue + ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) + ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A)) + findlen = 0 + lastpos = -1 + cnt = len(ama_t) + for c in reversed(ama_t): + cnt -= 1 + pos = que_t.rfind(c) + if lastpos >= 0: + pos_near = que_t[:lastpos].rfind(c) + if pos_near < 0: + findlen = 0 + lastpos = -1 + ama_t = ama_t[:cnt+1] + else: + pos = pos_near + if pos < 0: + if category(c) == 'Nd': + return -1 + ama_t = ama_t[:cnt] + findlen = 0 + lastpos = -1 + continue + if findlen > 0 and len(que_t) > 1 and lastpos == pos+1: + findlen += 1 + lastpos = pos + if findlen >= 4: + break + continue + findlen = 1 + lastpos = pos + if findlen==0: + return -1 + r = SequenceMatcher(None, ama_t, que_t).ratio() + if r > ratio: + sel = loc + ratio = r + save_t_ = ama_t + if ratio > 0.999: + break + + if ratio < 0.5: + return -1 + + if not debug: + # 目前采信相似度高于0.9的结果 + return sel if ratio >= 0.9 else -1 + + # debug 模式下记录识别准确率日志 + if ratio < 0.9: + # 相似度[0.5, 0.9)的淘汰结果单独记录日志 + (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return -1 + # 被采信的结果日志 + (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return sel diff --git a/config.ini b/config.ini index 06eda0c..5125ad3 100755 --- a/config.ini +++ b/config.ini @@ -7,7 +7,7 @@ soft_link=0 failed_move=1 auto_exit=0 transalte_to_sc=0 -multi_threading=1 +multi_threading=0 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) actor_gender=female del_empty_folder=1 @@ -85,3 +85,12 @@ water=2 switch=0 extrafanart_folder=extrafanart +; 剧情简介 +[storyline] +; website为javbus或javdb时,site为获取剧情简介信息的可选数据源站点列表。列表内站点同时并发查询,取值优先级 +; 从左到右,靠左站点没数据才会采用后面站点获得的。其中airav和avno1是中文剧情简介,xcity和amazon是日语的,由 +; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 +; site= +site=airav,avno1,xcity,amazon +; show_result剧情简介调试信息 0关闭 1简略 
diff --git a/config.ini b/config.ini
index 06eda0c..5125ad3 100755
--- a/config.ini
+++ b/config.ini
@@ -7,7 +7,7 @@ soft_link=0
 failed_move=1
 auto_exit=0
 transalte_to_sc=0
-multi_threading=1
+multi_threading=0
 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
 actor_gender=female
 del_empty_folder=1
@@ -85,3 +85,12 @@ water=2
 switch=0
 extrafanart_folder=extrafanart

+; plot synopsis
+[storyline]
+; When website is javbus or javdb, site lists the optional data sources for the plot synopsis. All listed sites are queried concurrently,
+; with priority running left to right: a later site's text is only used when every site before it returns nothing. airav and avno1 carry
+; Chinese synopses, xcity and amazon Japanese ones; Amazon listings have no ID number, so the matching DVD is picked with only about 99.6% accuracy. An empty list disables the lookup entirely, which speeds up scraping considerably.
+; site=
+site=airav,avno1,xcity,amazon
+; show_result: synopsis-lookup debug output. 0=off, 1=brief, 2=verbose (the verbose part is not written to the log). Turn on 2 to see why a failing synopsis source broke.
+show_result=0
diff --git a/config.py b/config.py
index 3b325d9..3226a55 100644
--- a/config.py
+++ b/config.py
@@ -240,6 +240,20 @@ class Config:
     def debug(self) -> bool:
         return self.getboolean_override("debug_mode", "switch")

+    def storyline_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "site")
+        except:
+            return "airav,avno1,xcity,amazon"
+
+    def storyline_show(self) -> int:
+        try:
+            v = self.conf.getint("storyline", "show_result")
+            return v if v in (0, 1, 2) else 2 if v > 2 else 0
+        except:
+            return 0
+
+
     @staticmethod
     def _exit(sec: str) -> None:
         print("[-] Read config error! Please check the {} section in config.ini", sec)
@@ -333,6 +347,11 @@ class Config:
         conf.set(sec13, "switch", 1)
         conf.set(sec13, "extrafanart_folder", "extrafanart")

+        sec14 = "storyline"
+        conf.add_section(sec14)
+        conf.set(sec14, "site", "airav,avno1,xcity,amazon")
+        conf.set(sec14, "show_result", "0")
+
         return conf
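Note: storyline_show() clamps any out-of-range value from config.ini instead of failing. A standalone illustration of that clamping expression:

    def clamp_show_result(v: int) -> int:
        # mirrors: v if v in (0, 1, 2) else 2 if v > 2 else 0
        return v if v in (0, 1, 2) else 2 if v > 2 else 0

    assert clamp_show_result(1) == 1
    assert clamp_show_result(7) == 2   # anything above 2 is treated as verbose
    assert clamp_show_result(-3) == 0  # negatives fall back to off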