import sys sys.path.append('../') import re from lxml import etree import json from ADC_function import * from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) def getTitle(html): result = html.xpath('//*[@id="program_detail_title"]/text()')[0] return result def getActor(browser): htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') t = [] for i in htmla: t.append(i.text.strip()) return t def getActorPhoto(browser): htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') t = {} for i in htmla: p = {i.text.strip(): i['href']} t.update(p) o = {} for k, v in t.items(): r = browser.open_relative(v) if r.ok: pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') p = {k: urljoin(browser.url, pic['src'])} else: p = {k, ''} o.update(p) return o def getStudio(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") except: result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") return result.strip('+').replace("', '", '').replace('"', '') def getRuntime(html): try: x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip() return x except: return '' def getLabel(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] return result except: return '' def getNum(html): try: result = html.xpath('//*[@id="hinban"]/text()')[0] return result except: return '' def getYear(getRelease): try: result = str(re.search('\d{4}', getRelease).group()) return result except: return getRelease def getRelease(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) except: return '' try: return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') except: return '' def getTag(html): x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()') return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else [] def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: result = 'https:' + result return result def getCover(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] return 'https:' + result except: return '' def getDirector(html): try: result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') return result except: return '' def getOutline(html, number, title): storyline_site = config.getInstance().storyline_site().split(',') a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字 if len(a): site = [n for n in storyline_site if n in a] g = getStoryline(number, title, site) if len(g): return g try: x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0] return x.replace(getNum(html), '') except: return '' def getSeries(html): try: try: result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] return result except: result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0] return result except: return '' def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'