From 0933e87944afabc1cdb18c26b272a60fa4554d33 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 10 Oct 2021 17:41:33 +0800 Subject: [PATCH] fix outline of javbus and javdb which caused by airav down --- ADC_function.py | 4 ++-- WebCrawler/javbus.py | 21 +++++++++++++++++++-- WebCrawler/javdb.py | 6 +++++- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 09fb11d..4480852 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -103,7 +103,7 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: return result.text -def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): +def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) if isinstance(cookies, dict): requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) @@ -113,7 +113,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d result = browser.open(url) if not result.ok: return '' - form = browser.select_form() if form_name is None else browser.select_form(form_name) + form = browser.select_form() if form_select is None else browser.select_form(form_select) if isinstance(fields, dict): for k, v in fields.items(): browser[k] = v diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 1af4359..c2ff11e 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -80,7 +80,7 @@ def getCID(htmlcode): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 try: @@ -91,6 +91,23 @@ def getOutline(number): #获取剧情介绍 except: pass return '' +def getOutline(number): #获取剧情介绍 从avno1.cc取得 + try: + number_up = number.upper() + result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except: + pass + return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -198,7 +215,7 @@ def main(number): return js if __name__ == "__main__" : - print(main('ADV-R0624')) # 404 + #print(main('ADV-R0624')) # 404 print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 7d69404..358682d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -196,7 +196,7 @@ def getDirector(a): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 try: htmlcode = get_html('https://cn.airav.wiki/video/' + number) from WebCrawler.airav import getOutline as airav_getOutline @@ -205,6 +205,9 @@ def getOutline(number): #获取剧情介绍 except: pass return '' +def getOutline(number): #获取剧情介绍 + from WebCrawler.javbus import getOutline as javbus_getOutline + return javbus_getOutline(number) def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -340,6 +343,7 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) + print(main('070116-197')) print(main('093021_539')) # 没有剧照 片商pacopacomama # print(main('FC2-2278260')) # print(main('FC2-735670'))