carib.py: 尝试获取中文剧情介绍

This commit is contained in:
lededev
2021-10-19 15:14:15 +08:00
parent aae4df73fa
commit daf7f5e0a0
3 changed files with 29 additions and 15 deletions

View File

@@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str:
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
replace('&lsquo;', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK
replace('&rsquo;', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK
replace('&amp;', '')) replace('&amp;', ''))

View File

@@ -4,26 +4,29 @@ import json
from lxml import html from lxml import html
import re import re
from ADC_function import * from ADC_function import *
from WebCrawler.storyline import getStoryline
def main(number: str) -> json: def main(number: str) -> json:
try: try:
carib_obj, browser = get_html_by_browser( # 因演员图片功能还未使用为提速暂时注释改为用get_html()
'https://www.caribbeancom.com/moviepages/'+number+'/index.html', #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="browser") # return_type='browser')
#if not r.ok:
if not carib_obj or not carib_obj.ok: # raise ValueError("page not found")
#htmlcode = str(browser.page)
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
htmlcode = htmlbyte.decode('euc-jp')
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found") raise ValueError("page not found")
lx = html.fromstring(str(browser.page)) lx = html.fromstring(htmlcode)
title = get_title(lx)
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
raise ValueError("page info not found")
dic = { dic = {
'title': get_title(lx), 'title': title,
'studio': '加勒比', 'studio': '加勒比',
'year': get_year(lx), 'year': get_year(lx),
'outline': get_outline(lx), 'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx), 'runtime': get_runtime(lx),
'director': '', 'director': '',
'actor': get_actor(lx), 'actor': get_actor(lx),
@@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str:
def get_year(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement) -> str: def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
storyline_site = config.getInstance().storyline_site().split(',')
a = set(storyline_site) & {'airav', 'avno1'}
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str: def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')

View File

@@ -23,11 +23,11 @@ class noThread(object):
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
def getStoryline(number, title): def getStoryline(number, title, sites: list=None):
start_time = time.time() start_time = time.time()
conf = config.getInstance() conf = config.getInstance()
debug = conf.debug() or conf.storyline_show() == 2 debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') storyine_sites = conf.storyline_site().split(',') if sites is None else sites
apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
mp_args = ((site, number, title, debug) for site in apply_sites) mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count()) cores = min(len(apply_sites), os.cpu_count())