137 lines
4.8 KiB
Python
Executable File
137 lines
4.8 KiB
Python
Executable File
import sys
|
||
sys.path.append('../')
|
||
import json
|
||
from lxml import html
|
||
import re
|
||
from ADC_function import *
|
||
from WebCrawler.storyline import getStoryline
|
||
|
||
def main(number: str) -> json:
|
||
try:
|
||
# 因演员图片功能还未使用,为提速暂时注释,改为用get_html()
|
||
#r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
|
||
# return_type='browser')
|
||
#if not r.ok:
|
||
# raise ValueError("page not found")
|
||
#htmlcode = str(browser.page)
|
||
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
|
||
htmlcode = htmlbyte.decode('euc-jp')
|
||
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
|
||
raise ValueError("page not found")
|
||
|
||
lx = html.fromstring(htmlcode)
|
||
title = get_title(lx)
|
||
|
||
dic = {
|
||
'title': title,
|
||
'studio': '加勒比',
|
||
'year': get_year(lx),
|
||
'outline': get_outline(lx, number, title),
|
||
'runtime': get_runtime(lx),
|
||
'director': '',
|
||
'actor': get_actor(lx),
|
||
'release': get_release(lx),
|
||
'number': number,
|
||
'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
|
||
'tag': get_tag(lx),
|
||
'extrafanart': get_extrafanart(lx),
|
||
'label': get_series(lx),
|
||
'imagecut': 1,
|
||
# 'actor_photo': get_actor_photo(browser),
|
||
'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
|
||
'source': 'carib.py',
|
||
'series': get_series(lx),
|
||
}
|
||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
|
||
return js
|
||
|
||
except Exception as e:
|
||
if config.getInstance().debug():
|
||
print(e)
|
||
dic = {"title": ""}
|
||
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
|
||
|
||
|
||
def get_title(lx: html.HtmlElement) -> str:
|
||
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
|
||
|
||
def get_year(lx: html.HtmlElement) -> str:
|
||
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
|
||
|
||
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
|
||
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
|
||
g = getStoryline(number, title)
|
||
if len(g):
|
||
return g
|
||
return o
|
||
|
||
def get_release(lx: html.HtmlElement) -> str:
|
||
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
|
||
|
||
def get_actor(lx: html.HtmlElement):
|
||
r = []
|
||
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
|
||
for act in actors:
|
||
if str(act) != '他':
|
||
r.append(act)
|
||
return r
|
||
|
||
def get_tag(lx: html.HtmlElement) -> str:
|
||
r = []
|
||
genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
|
||
for g in genres:
|
||
r.append(translateTag_to_sc(str(g)))
|
||
return r
|
||
|
||
def get_extrafanart(lx: html.HtmlElement) -> str:
|
||
r = []
|
||
genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
|
||
for g in genres:
|
||
jpg = str(g)
|
||
if '/member/' in jpg:
|
||
break
|
||
else:
|
||
r.append('https://www.caribbeancom.com' + jpg)
|
||
return r
|
||
|
||
def get_series(lx: html.HtmlElement) -> str:
|
||
try:
|
||
return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
|
||
except:
|
||
return ''
|
||
|
||
def get_runtime(lx: html.HtmlElement) -> str:
|
||
return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
|
||
|
||
def get_actor_photo(browser):
|
||
htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
|
||
t = {}
|
||
for a in htmla:
|
||
if a.text.strip() == '他':
|
||
continue
|
||
p = {a.text.strip(): a['href']}
|
||
t.update(p)
|
||
o = {}
|
||
for k, v in t.items():
|
||
if '/search_act/' not in v:
|
||
continue
|
||
r = browser.open_relative(v)
|
||
if not r.ok:
|
||
continue
|
||
html = browser.page.prettify()
|
||
pos = html.find('.full-bg')
|
||
if pos<0:
|
||
continue
|
||
css = html[pos:pos+100]
|
||
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
|
||
if not cssBGjpgs or not len(cssBGjpgs[0]):
|
||
continue
|
||
p = {k: urljoin(browser.url, cssBGjpgs[0])}
|
||
o.update(p)
|
||
return o
|
||
|
||
if __name__ == "__main__":
|
||
print(main("070116-197")) # actor have photo
|
||
print(main("041721-001"))
|
||
print(main("080520-001"))
|