import sys sys.path.append('../') from bs4 import BeautifulSoup#need install from ADC_function import * from WebCrawler import javbus ''' API 注册:https://www.airav.wiki/api/auth/signup 设置:https://www.airav.wiki/api/get_web_settings 搜索:https://www.airav.wiki/api/video/list?lng=zh-CN&search= 搜索:https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search= ''' host = 'https://www.airav.wiki' # airav这个网站没有演员图片,所以直接使用javbus的图 def getActorPhoto(javbus_json): result = javbus_json.get('actor_photo') if isinstance(result, dict) and len(result): return result return '' def getTitle(htmlcode): #获取标题 html = etree.fromstring(htmlcode, etree.HTMLParser()) title = str(html.xpath('/html/head/title/text()')[0]) result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip() return result def getStudio(htmlcode, javbus_json): #获取厂商 已修改 # javbus如果有数据以它为准 result = javbus_json.get('studio') if isinstance(result, str) and len(result): return result html = etree.fromstring(htmlcode,etree.HTMLParser()) return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']") def getYear(htmlcode, javbus_json): #获取年份 result = javbus_json.get('year') if isinstance(result, str) and len(result): return result release = getRelease(htmlcode, javbus_json) if len(release) != len('2000-01-01'): return '' return release[:4] def getCover(htmlcode, javbus_json): #获取封面图片 result = javbus_json.get('cover') if isinstance(result, str) and len(result): return result html = etree.fromstring(htmlcode, etree.HTMLParser()) return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0] def getRelease(htmlcode, javbus_json): #获取出版日期 result = javbus_json.get('release') if isinstance(result, str) and len(result): return result html = etree.fromstring(htmlcode, etree.HTMLParser()) try: result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group() except: return '' return result def getRuntime(javbus_json): #获取播放时长 result = javbus_json.get('runtime') if isinstance(result, str) and len(result): return result return '' # airav女优数据库较多日文汉字姓名,javbus较多日语假名,因此airav优先 def getActor(htmlcode, javbus_json): #获取女优 b=[] html = etree.fromstring(htmlcode, etree.HTMLParser()) a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()') for v in a: v = v.strip() if len(v): b.append(v) if len(b): return b result = javbus_json.get('actor') if isinstance(result, list) and len(result): return result return [] def getNum(htmlcode, javbus_json): #获取番号 result = javbus_json.get('number') if isinstance(result, str) and len(result): return result html = etree.fromstring(htmlcode, etree.HTMLParser()) title = str(html.xpath('/html/head/title/text()')[0]) result = str(re.findall('^\[(.*?)]', title)[0]) return result def getDirector(javbus_json): #获取导演 已修改 result = javbus_json.get('director') if isinstance(result, str) and len(result): return result return '' def getOutline(htmlcode): #获取概述 html = etree.fromstring(htmlcode, etree.HTMLParser()) try: result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip() return result except: return '' def getSerise(javbus_json): #获取系列 已修改 result = javbus_json.get('series') if isinstance(result, str) and len(result): return result return '' def getTag(htmlcode): # 获取标签 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') x = soup.find_all(attrs={'class': 'tagBtnMargin'}) a = x[0].find_all('a') for i in a: tag.append(i.get_text()) return tag def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'