carib.py: 尝试获取中文剧情介绍
This commit is contained in:
@@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str:
|
|||||||
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
|
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
|
||||||
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
|
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
|
||||||
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
|
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
|
||||||
|
replace('&lsquo;', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK
|
||||||
|
replace('&rsquo;', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK
|
||||||
replace('&amp;', '&'))
|
replace('&amp;', '&'))
|
||||||
|
|||||||
@@ -4,26 +4,29 @@ import json
|
|||||||
from lxml import html
|
from lxml import html
|
||||||
import re
|
import re
|
||||||
from ADC_function import *
|
from ADC_function import *
|
||||||
|
from WebCrawler.storyline import getStoryline
|
||||||
|
|
||||||
def main(number: str) -> json:
|
def main(number: str) -> json:
|
||||||
try:
|
try:
|
||||||
carib_obj, browser = get_html_by_browser(
|
# 因演员图片功能还未使用,为提速暂时注释,改为用get_html()
|
||||||
'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
|
#r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
|
||||||
return_type="browser")
|
# return_type='browser')
|
||||||
|
#if not r.ok:
|
||||||
if not carib_obj or not carib_obj.ok:
|
# raise ValueError("page not found")
|
||||||
|
#htmlcode = str(browser.page)
|
||||||
|
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
|
||||||
|
htmlcode = htmlbyte.decode('euc-jp')
|
||||||
|
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
|
||||||
raise ValueError("page not found")
|
raise ValueError("page not found")
|
||||||
|
|
||||||
lx = html.fromstring(str(browser.page))
|
lx = html.fromstring(htmlcode)
|
||||||
|
title = get_title(lx)
|
||||||
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
|
|
||||||
raise ValueError("page info not found")
|
|
||||||
|
|
||||||
dic = {
|
dic = {
|
||||||
'title': get_title(lx),
|
'title': title,
|
||||||
'studio': '加勒比',
|
'studio': '加勒比',
|
||||||
'year': get_year(lx),
|
'year': get_year(lx),
|
||||||
'outline': get_outline(lx),
|
'outline': get_outline(lx, number, title),
|
||||||
'runtime': get_runtime(lx),
|
'runtime': get_runtime(lx),
|
||||||
'director': '',
|
'director': '',
|
||||||
'actor': get_actor(lx),
|
'actor': get_actor(lx),
|
||||||
@@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str:
|
|||||||
def get_year(lx: html.HtmlElement) -> str:
|
def get_year(lx: html.HtmlElement) -> str:
|
||||||
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
|
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
|
||||||
|
|
||||||
def get_outline(lx: html.HtmlElement) -> str:
|
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
|
||||||
return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
|
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
|
||||||
|
|
||||||
|
storyline_site = config.getInstance().storyline_site().split(',')
|
||||||
|
a = set(storyline_site) & {'airav', 'avno1'}
|
||||||
|
if len(a):
|
||||||
|
site = [n for n in storyline_site if n in a]
|
||||||
|
g = getStoryline(number, title, site)
|
||||||
|
if len(g):
|
||||||
|
return g
|
||||||
|
return o
|
||||||
|
|
||||||
def get_release(lx: html.HtmlElement) -> str:
|
def get_release(lx: html.HtmlElement) -> str:
|
||||||
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
|
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
|
||||||
|
|||||||
@@ -23,11 +23,11 @@ class noThread(object):
|
|||||||
|
|
||||||
|
|
||||||
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
|
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
|
||||||
def getStoryline(number, title):
|
def getStoryline(number, title, sites: list=None):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
conf = config.getInstance()
|
conf = config.getInstance()
|
||||||
debug = conf.debug() or conf.storyline_show() == 2
|
debug = conf.debug() or conf.storyline_show() == 2
|
||||||
storyine_sites = conf.storyline_site().split(',')
|
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
|
||||||
apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
|
apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
|
||||||
mp_args = ((site, number, title, debug) for site in apply_sites)
|
mp_args = ((site, number, title, debug) for site in apply_sites)
|
||||||
cores = min(len(apply_sites), os.cpu_count())
|
cores = min(len(apply_sites), os.cpu_count())
|
||||||
|
|||||||
Reference in New Issue
Block a user