carib.py: 尝试获取中文剧情介绍

2021-10-19 15:14:15 +08:00
parent aae4df73fa
commit daf7f5e0a0
3 changed files with 29 additions and 15 deletions
--- a/WebCrawler/init.py
+++ b/WebCrawler/init.py
@@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str:
                replace('<', 'ᐸ').       # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
                replace('>', 'ᐳ').       # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
                replace('|', 'ǀ').       # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+                replace('&lsquo;', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK
+                replace('&rsquo;', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK
                replace('&amp;', '＆'))
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -4,26 +4,29 @@ import json
 from lxml import html
 import re
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
 def main(number: str) -> json:
    try:
-        carib_obj, browser = get_html_by_browser(
+        # 因演员图片功能还未使用，为提速暂时注释，改为用get_html()
-            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+        #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-            return_type="browser")
+        #                return_type='browser')
-
+        #if not r.ok:
-        if not carib_obj or not carib_obj.ok:
+        #    raise ValueError("page not found")
+        #htmlcode = str(browser.page)
+        htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
+        htmlcode = htmlbyte.decode('euc-jp')
+        if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
+            raise ValueError("page not found")
-        lx = html.fromstring(str(browser.page))
+        lx = html.fromstring(htmlcode)
-
+        title = get_title(lx)
-        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
-            raise ValueError("page info not found")
        dic = {
-            'title': get_title(lx),
+            'title': title,
            'studio': '加勒比',
            'year': get_year(lx),
-            'outline': get_outline(lx),
+            'outline': get_outline(lx, number, title),
            'runtime': get_runtime(lx),
            'director': '',
            'actor': get_actor(lx),
@@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-def get_outline(lx: html.HtmlElement) -> str:
+def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
-    return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+    o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+    storyline_site = config.getInstance().storyline_site().split(',')
+    a = set(storyline_site) & {'airav', 'avno1'}
+    if len(a):
+        site = [n for n in storyline_site if n in a]
+        g = getStoryline(number, title, site)
+        if len(g):
+            return g
+    return o
 def get_release(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
--- a/WebCrawler/storyline.py
+++ b/WebCrawler/storyline.py
@@ -23,11 +23,11 @@ class noThread(object):
 # 获取剧情介绍 从列表中的站点同时查，取值优先级从前到后
-def getStoryline(number, title):
+def getStoryline(number, title, sites: list=None):
    start_time = time.time()
    conf = config.getInstance()
    debug = conf.debug() or conf.storyline_show() == 2
-    storyine_sites = conf.storyline_site().split(',')
+    storyine_sites = conf.storyline_site().split(',') if sites is None else sites
    apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
    mp_args = ((site, number, title, debug) for site in apply_sites)
    cores = min(len(apply_sites), os.cpu_count())