replace browser by session in some places

2021-11-01 03:49:35 +08:00
parent 0fe1b2fcac
commit 3b498d32ca
6 changed files with 215 additions and 122 deletions
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -4,7 +4,6 @@ import re
 from lxml import etree
 import json
 from ADC_function import *
-from mechanicalsoup.stateful_browser import StatefulBrowser
 from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -30,8 +29,8 @@ def getActor(html):
        idx = idx + 1
    return r

-def getaphoto(url,  browser):
-    html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
+def getaphoto(url, session):
+    html_page = session.get(url).text if isinstance(session, requests.Session) else get_html(url)
    img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
    img_url = img_prether.findall(html_page)
    if img_url:
@@ -39,7 +38,7 @@ def getaphoto(url,  browser):
    else:
        return ''

-def getActorPhoto(html, javdb_site,  browser): #//*[@id="star_qdt"]/li/a/img
+def getActorPhoto(html, javdb_site, session):
    actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
    if not actorall:
        return {}
@@ -47,7 +46,7 @@ def getActorPhoto(html, javdb_site,  browser): #//*[@id="star_qdt"]/li/a/img
    actor_photo = {}
    for i in actorall:
        if i.text in a:
-            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
+            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
    return actor_photo

 def getStudio(a, html):
@@ -178,15 +177,6 @@ def getDirector(html):
    result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
-def getOutline0(number):  #获取剧情介绍 airav.wiki站点404，函数暂时更名，等无法恢复时删除
-    try:
-        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-        from WebCrawler.airav import getOutline as airav_getOutline
-        result = airav_getOutline(htmlcode)
-        return result
-    except:
-        pass
-    return ''
 def getOutline(number, title):  #获取剧情介绍 多进程并发查询
    return getStoryline(number,title)
 def getSeries(html):
@@ -224,11 +214,11 @@ def main(number):
            javdb_site = secrets.choice(javdb_sites)
        if debug:
            print(f'[!]javdb:select site {javdb_site}')
-        browser = None
+        session = None
        try:
            javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
-            res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
-            if not res.ok:
+            res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
+            if not res:
                raise
            query_result = res.text
        except:
@@ -251,8 +241,9 @@ def main(number):
                    raise ValueError("number not found")
                correct_url = urls[0]
        try:
-            if isinstance(browser, StatefulBrowser):  # get faster benefit from http keep-alive
-                detail_page = browser.open_relative(correct_url).text
+            if isinstance(session, requests.Session):  # get faster benefit from http keep-alive
+                javdb_detail_url = urljoin(res.url, correct_url)
+                detail_page = session.get(javdb_detail_url).text
            else:
                javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
                detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
@@ -303,8 +294,8 @@ def main(number):
            'tag': getTag(lx),
            'label': getLabel(lx),
            'year': getYear(detail_page),  # str(re.search('\d{4}',getRelease(a)).group()),
-#            'actor_photo': getActorPhoto(lx, javdb_site,  browser),
-            'website': 'https://javdb.com' + correct_url,
+#            'actor_photo': getActorPhoto(lx, javdb_site,  session),
+            'website': urljoin('https://javdb.com', correct_url),
            'source': 'javdb.py',
            'series': getSeries(lx),