javbus, javlib: fetch the outline from airav

2021-04-22 03:22:24 +08:00
parent 98c8585327
commit f761e5bccc
3 changed files with 25 additions and 21 deletions
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -225,8 +225,8 @@ def main(number):
 if __name__ == '__main__':
    #print(main('ADN-188'))

-    print(search('ADN-188'))
-    print(search('012717_472'))
-    print(search('080719-976'))
-    print(search('姫川ゆうな'))
+    print(main('ADN-188'))
+    print(main('012717_472'))
+    print(main('080719-976'))
+    print(main('姫川ゆうな'))

--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
 from WebCrawler import fanza
+import airav

 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
    soup = BeautifulSoup(htmlcode, 'lxml')
@@ -79,12 +80,13 @@ def getCID(htmlcode):
    string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
    result = re.sub('/.*?.jpg','',string)
    return result
-def getOutline(htmlcode):  #获取演员
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getOutline(number):  #获取演员
    try:
-        result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n','')
+        response = json.loads(airav.main(number))
+        result = response['outline']
        return result
-    except:
+    except Exception as e:
+        print(e)
        return ''
 def getSerise(htmlcode):   #获取系列 已修改
    html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -122,15 +124,11 @@ def main_uncensored(number):
    htmlcode = get_html('https://www.javbus.com/ja/' + number)
    if getTitle(htmlcode) == '':
        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
-    try:
-        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
-    except:
-        dww_htmlcode = ''
    dic = {
        'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
-        'outline': getOutline(dww_htmlcode),
+        'outline': getOutline(number),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
@@ -157,15 +155,11 @@ def main(number):
                htmlcode = get_html('https://www.fanbus.us/' + number)
            except:
                htmlcode = get_html('https://www.javbus.com/' + number)
-            try:
-                dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
-            except:
-                dww_htmlcode = ''
            dic = {
                'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
                'studio': getStudio(htmlcode),
                'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
-                'outline': getOutline(dww_htmlcode),
+                'outline': getOutline(number),
                'runtime': getRuntime(htmlcode),
                'director': getDirector(htmlcode),
                'actor': getActor(htmlcode),
--- a/WebCrawler/javlib.py
+++ b/WebCrawler/javlib.py
@@ -3,6 +3,7 @@ sys.path.append('../')
 import json
 import bs4
 import re
+import airav
 from bs4 import BeautifulSoup
 from lxml import html
 from http.cookies import SimpleCookie
@@ -42,7 +43,7 @@ def main(number: str):
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
-            "outline": "",
+            "outline": get_outline(number),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
@@ -77,7 +78,7 @@ def main(number: str):
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
-            "outline": "",
+            "outline": get_outline(number),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
@@ -102,6 +103,15 @@ def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


+def get_outline(number):
+    try:
+        response = json.loads(airav.main(number))
+        result = response['outline']
+        return result
+    except:
+        return ''
+
+
 def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")

@@ -145,7 +155,7 @@ def get_cover(lx: html.HtmlComment) -> str:


 if __name__ == "__main__":
-    lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    #lists = ["DVMC-003"]
    for num in lists:
        print(main(num))