WebCrawler:全面换装getInstance()，厘清airav.py与javbus.py及javdb.py的相爱相杀

2021-10-08 11:46:35 +08:00
parent cf072e79d1
commit a405c5c41b
11 changed files with 206 additions and 167 deletions
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -1,51 +1,53 @@
 import sys
 sys.path.append('../')
 import json
-from bs4 import BeautifulSoup
 from lxml import html
 import re
 from ADC_function import *

 def main(number: str) -> json:
    try:
-        caribbytes, browser = get_html_by_browser(
+        carib_obj, browser = get_html_by_browser(
            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
            return_type="browser")

-        if not caribbytes or not caribbytes.ok:
+        if not carib_obj or not carib_obj.ok:
            raise ValueError("page not found")

        lx = html.fromstring(str(browser.page))

        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
+
+        dic = {
+            'title': get_title(lx),
+            'studio': '加勒比',
+            'year': get_year(lx),
+            'outline': get_outline(lx),
+            'runtime': get_runtime(lx),
+            'director': '',
+            'actor': get_actor(lx),
+            'release': get_release(lx),
+            'number': number,
+            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'tag': get_tag(lx),
+            'extrafanart': get_extrafanart(lx),
+            'label': get_series(lx),
+            'imagecut': 1,
+#            'actor_photo': get_actor_photo(browser),
+            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            'source': 'carib.py',
+            'series': get_series(lx),
+        }
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
+        return js
+
    except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
-    dic = {
-        'title': get_title(lx),
-        'studio': '加勒比',
-        'year': get_year(lx),
-        'outline': get_outline(lx),
-        'runtime': get_runtime(lx),
-        'director': '',
-        'actor': get_actor(lx),
-        'release': get_release(lx),
-        'number': number,
-        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
-        'tag': get_tag(lx),
-        'extrafanart': get_extrafanart(lx),
-        'label': get_series(lx),
-        'imagecut': 1,
-#        'actor_photo': get_actor_photo(browser),
-        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
-        'source': 'carib.py',
-        'series': get_series(lx),
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
-    return js
+

 def get_title(lx: html.HtmlElement) -> str:
    return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
@@ -114,11 +116,10 @@ def get_actor_photo(browser):
        if pos<0:
            continue
        css = html[pos:pos+100]
-        p0 = css.find('background: url(')
-        p1 = css.find('.jpg)')
-        if p0<0 or p1<0:
+        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
+        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
-        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        p = {k: urljoin(browser.url, cssBGjpgs[0])}
        o.update(p)
    return o