carib.py: add outline/series/actor_photo

2021-09-23 15:45:00 +08:00
parent 5e0e8b9cea
commit 50574a705b
2 changed files with 70 additions and 13 deletions
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -84,6 +84,25 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
    print("[-]" + errors)


+def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+    """Open ``url`` in a new mechanicalsoup ``StatefulBrowser`` and return the page.
+
+    :param url: address to open.
+    :param cookies: optional dict of cookies added to the browser session
+        before the request (ignored unless it is a dict).
+    :param ua: User-Agent string; ``G_USER_AGENT`` is used when None.
+    :param return_type: selects the shape of the result --
+        ``"object"`` the response, ``"content"`` the raw body bytes,
+        ``"browser"`` a ``(response, browser)`` tuple, anything else the
+        body text decoded as UTF-8.
+    :return: the value selected by ``return_type``. When the response is not
+        OK the result is ``''``, or ``(None, None)`` for ``"browser"`` so that
+        callers can still unpack two values.
+    """
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
+    if isinstance(cookies, dict):
+        # The parameter used to be accepted but was never sent with the request.
+        browser.session.cookies.update(cookies)
+    configProxy = config.Config().proxy()
+    if configProxy.enable:
+        browser.session.proxies = configProxy.proxies()
+    result = browser.open(url)
+    if not result.ok:
+        # Keep the two-value shape for "browser": `resp, browser = ...` would
+        # otherwise die while unpacking '' instead of letting the caller
+        # test the response.
+        return (None, None) if return_type == "browser" else ''
+    # Only affects how `.text` is decoded; `.content` stays raw bytes.
+    result.encoding = "utf-8"
+    if return_type == "object":
+        return result
+    if return_type == "content":
+        return result.content
+    if return_type == "browser":
+        return result, browser
+    return result.text
+
+
 def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
    if isinstance(cookies, dict):
@@ -592,4 +611,3 @@ def is_link(filename: str):
    elif os.stat(filename).st_nlink > 1:
        return True # hard link Linux MAC OSX Windows NTFS
    return False
-
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -8,15 +8,16 @@ from ADC_function import *

 def main(number: str) -> json:
    try:
-        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-                             return_type="content")
+        caribbytes, browser = get_html_by_browser(
+            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+            return_type="browser")

-        caribhtml = caribbytes.decode("euc_jp")
+        if not caribbytes or not caribbytes.ok:
+            raise ValueError("page not found")

-        soup = BeautifulSoup(caribhtml, "html.parser")
-        lx = html.fromstring(str(soup))
+        lx = html.fromstring(str(browser.page))

-        if not soup.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
+        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
            raise ValueError("page info not found")
    except Exception as e:
        if config.Config().debug():
@@ -27,7 +28,7 @@ def main(number: str) -> json:
        'title': get_title(lx),
        'studio': '加勒比',
        'year': get_year(lx),
-        'outline': '',
+        'outline': get_outline(lx),
        'runtime': get_runtime(lx),
        'director': '',
        'actor': get_actor(lx),
@@ -36,12 +37,12 @@ def main(number: str) -> json:
        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
        'tag': get_tag(lx),
        'extrafanart': get_extrafanart(lx),
-        'label': '',
-        'imagecut': 0,
-        'actor_photo': '',
+        'label': get_series(lx),
+        'imagecut': 1,
+        'actor_photo': get_actor_photo(browser),
        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
        'source': 'carib.py',
-        'series': '',
+        'series': get_series(lx),
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
    return js
@@ -52,10 +53,13 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]

+def get_outline(lx: html.HtmlElement) -> str:
+    """Return the movie description from the page, or '' when there is none.
+
+    :param lx: parsed movie page.
+    :return: the first text node of the ``<p itemprop="description">`` inside
+        the ``movie-info section`` div, stripped of surrounding whitespace.
+    """
+    texts = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")
+    # The outline is optional metadata: a page without a description should
+    # yield '' (as get_series does) instead of raising IndexError.
+    return texts[0].strip() if texts else ''
+
 def get_release(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')

-def get_actor(lx: html.HtmlElement) -> str:
+def get_actor(lx: html.HtmlElement):
    r = []
    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
    for act in actors:
@@ -81,9 +85,44 @@ def get_extrafanart(lx: html.HtmlElement) -> str:
            r.append('https://www.caribbeancom.com' + jpg)
    return r

+def get_series(lx: html.HtmlElement) -> str:
+    """Return the series name from the page, or '' when the movie has none.
+
+    :param lx: parsed movie page.
+    :return: text of the first link in the ``spec-content`` span that sits
+        next to the ``spec-title`` span containing 'シリーズ', stripped.
+    """
+    names = lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")
+    # Test for the empty result explicitly; a bare `except:` would also hide
+    # KeyboardInterrupt/SystemExit and unrelated programming errors.
+    return str(names[0]).strip() if names else ''
+
 def get_runtime(lx: html.HtmlElement) -> str:
    return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()

+def get_actor_photo(browser):
+    """Collect actor portrait URLs for the movie page loaded in ``browser``.
+
+    Every actor link in the first spec row is opened and the portrait is read
+    from the ``.full-bg`` CSS rule (``background: url(....jpg)``) of that page.
+
+    :param browser: mechanicalsoup ``StatefulBrowser`` currently on the movie
+        page. It is navigated to each actor page, so ``browser.page`` and
+        ``browser.url`` no longer refer to the movie page afterwards.
+    :return: dict mapping actor name to absolute photo URL. Actors whose link
+        is not a ``/search_act/`` page, whose page fails to load, or whose
+        page has no portrait rule are left out.
+    """
+    import re  # function-scope import keeps this block self-contained
+
+    # Portrait rule: `.full-bg { ... background: url(<path>.jpg) ... }`,
+    # optionally with a quoted URL. `[^}]*?` keeps the match inside one rule.
+    portrait_rule = re.compile(
+        r"\.full-bg[^}]*?background:\s*url\(\s*['\"]?([^'\")]+?\.jpg)['\"]?\s*\)")
+    # browser.url changes with every actor page opened below; resolve all
+    # hrefs against the movie page they were taken from.
+    movie_url = browser.url
+    anchors = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+    links = {}
+    for a in anchors:
+        name = a.text.strip()
+        # '他' is presumably the "others" placeholder rather than an actor.
+        if name == '他':
+            continue
+        # .get() instead of a['href']: an anchor without href must not raise.
+        links[name] = a.get('href')
+    photos = {}
+    for name, href in links.items():
+        if not href or '/search_act/' not in href:
+            continue
+        resp = browser.open(urljoin(movie_url, href))
+        if not resp.ok:
+            continue
+        found = portrait_rule.search(browser.page.prettify())
+        if found is None:
+            continue
+        photos[name] = urljoin(browser.url, found.group(1))
+    return photos
+
 if __name__ == "__main__":
+    print(main("070116-197")) # actor has a photo
    print(main("041721-001"))
    print(main("080520-001"))