Parallel query on storyline data

2021-10-17 21:59:08 +08:00
parent b006aee34d
commit a546c4e83e
7 changed files with 336 additions and 53 deletions
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -6,6 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler.storyline import getStoryline
 import inspect

 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
@@ -91,33 +92,8 @@ def getOutline0(number):  #获取剧情介绍 airav.wiki站点404，函数暂时
    except:
        pass
    return ''
-def getOutline(number):  #获取剧情介绍 从avno1.cc取得
-    try:
-        url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
-                secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
-                '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
-        ]) # 随机选一个，避免网站httpd日志中单个ip的请求太过单一
-        number_up = number.upper()
-        result, browser = get_html_by_form(url,
-            form_select='div.wrapper > div.header > div.search > form',
-            fields = {'kw' : number_up},
-            return_type = 'browser')
-        if not result.ok:
-            raise
-        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
-        page_number = title[title.rfind(' '):].upper()
-        if not number_up in page_number:
-            raise
-        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
-    except:
-        pass
-    try:
-        from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline
-        detail_html, browser = open_by_browser(number)
-        return xcity_getOutline(detail_html)
-    except:
-        pass
-    return ''
+def getOutline(number, title):  # Get the plot synopsis via a multi-process concurrent query
+    return getStoryline(number,title)
 def getSerise(htmlcode):   #获取系列 已修改
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    # 如果记录中冇导演，系列排在第6位
@@ -156,11 +132,12 @@ def main_uncensored(number):
        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
    if "<title>404 Page Not Found" in htmlcode:
        raise Exception('404 page not found')
+    title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','')
    dic = {
-        'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
+        'title': title,
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
-        'outline': getOutline(number),
+        'outline': getOutline(number, title),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
@@ -189,11 +166,12 @@ def main(number):
                htmlcode = get_html('https://www.javbus.com/' + number)
            if "<title>404 Page Not Found" in htmlcode:
                raise Exception('404 page not found')
+            title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode)))
            dic = {
-                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
+                'title': title,
                'studio': getStudio(htmlcode),
                'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
-                'outline': getOutline(number),
+                'outline': getOutline(number, title),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
@@ -225,7 +203,11 @@ def main(number):
        return js

 if __name__ == "__main__" :
-    #print(main('ADV-R0624'))    # 404
+    config.G_conf_override['debug_mode:switch'] = True
+    print(main('ABP-888'))
+    print(main('ABP-960'))
+    # print(main('ADV-R0624'))    # 404
+    # print(main('MMNT-010'))
    print(main('ipx-292'))
    print(main('CEMD-011'))
    print(main('CJOD-278'))