diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index 44f9094..039fed0 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str:
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+ replace('&lsquo;', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK
+ replace('&rsquo;', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK
replace('&amp;', '&'))
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index c1a25d9..3e583df 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -4,26 +4,29 @@ import json
from lxml import html
import re
from ADC_function import *
+from WebCrawler.storyline import getStoryline
def main(number: str) -> json:
try:
- carib_obj, browser = get_html_by_browser(
- 'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
- return_type="browser")
-
- if not carib_obj or not carib_obj.ok:
+ # 因演员图片功能还未使用,为提速暂时注释,改为用get_html()
+ #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+ # return_type='browser')
+ #if not r.ok:
+ # raise ValueError("page not found")
+ #htmlcode = str(browser.page)
+ htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
+ htmlcode = htmlbyte.decode('euc-jp')
+ if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
- lx = html.fromstring(str(browser.page))
-
- if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
- raise ValueError("page info not found")
+ lx = html.fromstring(htmlcode)
+ title = get_title(lx)
dic = {
- 'title': get_title(lx),
+ 'title': title,
'studio': '加勒比',
'year': get_year(lx),
- 'outline': get_outline(lx),
+ 'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
@@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str:
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-def get_outline(lx: html.HtmlElement) -> str:
- return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
+ o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+
+ storyline_site = config.getInstance().storyline_site().split(',')
+ a = set(storyline_site) & {'airav', 'avno1'}
+ if len(a):
+ site = [n for n in storyline_site if n in a]
+ g = getStoryline(number, title, site)
+ if len(g):
+ return g
+ return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py
index 693f404..5c2b91a 100644
--- a/WebCrawler/storyline.py
+++ b/WebCrawler/storyline.py
@@ -23,11 +23,11 @@ class noThread(object):
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
-def getStoryline(number, title):
+def getStoryline(number, title, sites: list=None):
start_time = time.time()
conf = config.getInstance()
debug = conf.debug() or conf.storyline_show() == 2
- storyine_sites = conf.storyline_site().split(',')
+ storyine_sites = conf.storyline_site().split(',') if sites is None else sites
apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count())