import sys sys.path.append('../') import re from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline import inspect def getActorPhoto(html): actors = html.xpath('//div[@class="star-name"]/a') d={} for i in actors: url=i.attrib['href'] t=i.attrib['title'] html = etree.fromstring(get_html(url), etree.HTMLParser()) p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) p2={t:p} d.update(p2) return d def getTitle(html): #获取标题 title = str(html.xpath('/html/head/title/text()')[0]) title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip() return title def getStudioJa(html): x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()') return str(x[0]) if len(x) else '' def getStudio(html): #获取厂商 x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()') return str(x[0]) if len(x) else '' def getYear(html): #获取年份 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() return result[:4] if len(result)>=len('2000-01-01') else '' def getCover(html): #获取封面链接 image = str(html.xpath('//a[@class="bigImage"]/@href')[0]) return urljoin("https://www.javbus.com", image) def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result def getActor(html): #获取女优 b=[] actors = html.xpath('//div[@class="star-name"]/a') for i in actors: b.append(i.attrib['title']) return b def getNum(html): #获取番号 kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') return kwdlist[0] def getDirectorJa(html): x = html.xpath('//span[contains(text(),"監督:")]/../a/text()') return str(x[0]) if len(x) else '' def getDirector(html): #获取导演 x = html.xpath('//span[contains(text(),"導演:")]/../a/text()') return str(x[0]) if len(x) else '' def getCID(html): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result def getOutline(number, title): #获取剧情介绍 多进程并发查询 if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 return getStoryline(number,title) def getSeriseJa(html): x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') return str(x[0]) if len(x) else '' def getSerise(html): #获取系列 x = html.xpath('//span[contains(text(),"系列:")]/../a/text()') return str(x[0]) if len(x) else '' def getTag(html): # 获取标签 klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') return klist[1:] def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'