Files
AV_Data_Capture/WebCrawler/carib.py

137 lines
4.8 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import sys
sys.path.append('../')
import json
from lxml import html
import re
from ADC_function import *
from WebCrawler.storyline import getStoryline
def main(number: str) -> json:
try:
# 因演员图片功能还未使用为提速暂时注释改为用get_html()
#r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
# return_type='browser')
#if not r.ok:
# raise ValueError("page not found")
#htmlcode = str(browser.page)
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
htmlcode = htmlbyte.decode('euc-jp')
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
lx = html.fromstring(htmlcode)
title = get_title(lx)
dic = {
'title': title,
'studio': '加勒比',
'year': get_year(lx),
'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
'release': get_release(lx),
'number': number,
'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
'tag': get_tag(lx),
'extrafanart': get_extrafanart(lx),
'label': get_series(lx),
'imagecut': 1,
# 'actor_photo': get_actor_photo(browser),
'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
'source': 'carib.py',
'series': get_series(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
def get_actor(lx: html.HtmlElement):
r = []
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
for act in actors:
if str(act) != '':
r.append(act)
return r
def get_tag(lx: html.HtmlElement) -> str:
r = []
genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
for g in genres:
r.append(translateTag_to_sc(str(g)))
return r
def get_extrafanart(lx: html.HtmlElement) -> str:
r = []
genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
for g in genres:
jpg = str(g)
if '/member/' in jpg:
break
else:
r.append('https://www.caribbeancom.com' + jpg)
return r
def get_series(lx: html.HtmlElement) -> str:
try:
return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
except:
return ''
def get_runtime(lx: html.HtmlElement) -> str:
return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
def get_actor_photo(browser):
htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
t = {}
for a in htmla:
if a.text.strip() == '':
continue
p = {a.text.strip(): a['href']}
t.update(p)
o = {}
for k, v in t.items():
if '/search_act/' not in v:
continue
r = browser.open_relative(v)
if not r.ok:
continue
html = browser.page.prettify()
pos = html.find('.full-bg')
if pos<0:
continue
css = html[pos:pos+100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(browser.url, cssBGjpgs[0])}
o.update(p)
return o
if __name__ == "__main__":
print(main("070116-197")) # actor have photo
print(main("041721-001"))
print(main("080520-001"))