import sys sys.path.append('../') import json from lxml import html import re from ADC_function import * from WebCrawler.storyline import getStoryline def main(number: str) -> json: try: # 因演员图片功能还未使用,为提速暂时注释,改为用get_html() #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html', # return_type='browser') #if not r.ok: # raise ValueError("page not found") #htmlcode = str(browser.page) htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content') htmlcode = htmlbyte.decode('euc-jp') if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode: raise ValueError("page not found") lx = html.fromstring(htmlcode) title = get_title(lx) dic = { 'title': title, 'studio': '加勒比', 'year': get_year(lx), 'outline': get_outline(lx, number, title), 'runtime': get_runtime(lx), 'director': '', 'actor': get_actor(lx), 'release': get_release(lx), 'number': number, 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', 'tag': get_tag(lx), 'extrafanart': get_extrafanart(lx), 'label': get_series(lx), 'imagecut': 1, # 'actor_photo': get_actor_photo(browser), 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', 'source': 'carib.py', 'series': get_series(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) return js except Exception as e: if config.getInstance().debug(): print(e) dic = {"title": ""} return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) def get_title(lx: html.HtmlElement) -> str: return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip() def get_year(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() storyline_site = config.getInstance().storyline_site().split(',') a = set(storyline_site) & {'airav', 'avno1'} if len(a): site = [n for n in storyline_site if n in a] g = getStoryline(number, title, site) if len(g): return g return o def get_release(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') def get_actor(lx: html.HtmlElement): r = [] actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()") for act in actors: if str(act) != '他': r.append(act) return r def get_tag(lx: html.HtmlElement) -> str: r = [] genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()") for g in genres: r.append(translateTag_to_sc(str(g))) return r def get_extrafanart(lx: html.HtmlElement) -> str: r = [] genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href") for g in genres: jpg = str(g) if '/member/' in jpg: break else: r.append('https://www.caribbeancom.com' + jpg) return r def get_series(lx: html.HtmlElement) -> str: try: return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip() except: return '' def get_runtime(lx: html.HtmlElement) -> str: return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip() def get_actor_photo(browser): htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a') t = {} for a in htmla: if a.text.strip() == '他': continue p = {a.text.strip(): a['href']} t.update(p) o = {} for k, v in t.items(): if '/search_act/' not in v: continue r = browser.open_relative(v) if not r.ok: continue html = browser.page.prettify() pos = html.find('.full-bg') if pos<0: continue css = html[pos:pos+100] cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) if not cssBGjpgs or not len(cssBGjpgs[0]): continue p = {k: urljoin(browser.url, cssBGjpgs[0])} o.update(p) return o if __name__ == "__main__": print(main("070116-197")) # actor have photo print(main("041721-001")) print(main("080520-001"))