diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index e1608b6..dc54b46 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -134,6 +134,14 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
         print('[-]Movie Number not found!')
         return None
 
+    # Strictly check the returned number, to guard against faulty data sources that answer every query with "本橋実来 ADZ335", i.e. a number that does not match the request.
+    # The number naming rules currently followed are javdb.com's (Domain Creation Date: 2013-06-19T18:34:27Z).
+    # Other rule sets, e.g. airav.wiki's (Domain Creation Date: 2019-08-28T07:18:42.0Z), are worth tracking as well.
+    # If different studios ever collide on the same number under the javdb.com rules, consider switching rules and updating the number parsing and crawling code accordingly.
+    if str(json_data.get('number')).upper() != file_number.upper():
+        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
+        return None
+
     # ================================================网站规则添加结束================================================
 
     title = json_data.get('title')
@@ -225,6 +233,8 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
     studio = studio.replace('エムズビデオグループ','M’s Video Group')
     studio = studio.replace('ミニマム','Minimum')
     studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
+    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
+    studio = studio.replace('パコパコママ','pacopacomama')
     studio = re.sub('.*/妄想族','妄想族',studio)
     studio = studio.replace('/',' ')
     # === 替换Studio片假名 END
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index 5925421..f7b144c 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -6,6 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler import javbus
 
 '''
 API
@@ -17,95 +18,94 @@ API
 host = 'https://www.airav.wiki'
 
 # airav这个网站没有演员图片,所以直接使用javbus的图
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'star-name'})
-    d={}
-    for i in a:
-        l=i.a['href']
-        t=i.get_text()
-        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=urljoin("https://www.javbus.com",
-                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
-        p2={t:p}
-        d.update(p2)
-    return d
+def getActorPhoto(javbus_json):
+    result = javbus_json.get('actor_photo')
+    if isinstance(result, dict) and len(result):
+        return result
+    return ''
 def getTitle(htmlcode): #获取标题
-    doc = pq(htmlcode)
-    # h5:first-child定位第一个h5标签,妈的找了好久才找到这个语法
-    title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
-    try:
-        title2 = re.sub('n\d+-','',title)
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
+    return result
 
-        return title2
+def getStudio(htmlcode, javbus_json): # get studio (revised)
+    # prefer the javbus data when it is available
+    result = javbus_json.get('studio')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode,etree.HTMLParser())
+    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
+def getYear(htmlcode, javbus_json): # get year
+    result = javbus_json.get('year')
+    if isinstance(result, str) and len(result):
+        return result
+    release = getRelease(htmlcode, javbus_json)
+    if len(release) != len('2000-01-01'):
+        return ''
+    return release[:4]
+def getCover(htmlcode, javbus_json): # get cover image
+    result = javbus_json.get('cover')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
+def getRelease(htmlcode, javbus_json): # get release date
+    result = javbus_json.get('release')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    try:
+        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
     except:
-        return title
-
-def getStudio(htmlcode): #获取厂商 已修改
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    # 如果记录中冇导演,厂商排在第4位
-    if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    # 如果记录中有导演,厂商排在第5位
-    elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
-    else:
-        result = ''
+        return ''
     return result
-def getYear(htmlcode): #获取年份
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getCover(htmlcode): #获取封面链接
-    doc = pq(htmlcode)
-    image = doc('a.bigImage')
-    return urljoin("https://www.javbus.com", image.attr('href'))
-def getRelease(htmlcode): #获取出版日期
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getRuntime(htmlcode): #获取分钟 已修改
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
-    return result
-def getActor(htmlcode): #获取女优
+def getRuntime(javbus_json): # get runtime
+    result = javbus_json.get('runtime')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+# airav's actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
+def getActor(htmlcode, javbus_json): # get actresses
     b=[]
-    soup=BeautifulSoup(htmlcode,'lxml')
-    a=soup.find_all(attrs={'class':'star-name'})
-    for i in a:
-        b.append(i.get_text())
-    return b
-def getNum(htmlcode): #获取番号
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getDirector(htmlcode): #获取导演 已修改
+    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
+    for v in a:
+        v = v.strip()
+        if len(v):
+            b.append(v)
+    if len(b):
+        return b
+    result = javbus_json.get('actor')
+    if isinstance(result, list) and len(result):
+        return result
+    return []
+def getNum(htmlcode, javbus_json): # get number
+    result = javbus_json.get('number')
+    if isinstance(result, str) and len(result):
+        return result
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    else:
-        result = '' # 记录中有可能没有导演数据
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('^\[(.*?)]', title)[0])
     return result
-
-def getOutline(htmlcode): #获取演员
+def getDirector(javbus_json): # get director (revised)
+    result = javbus_json.get('director')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+def getOutline(htmlcode): # get outline
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     try:
-        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
+        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
         return result
     except:
         return ''
-def getSerise(htmlcode): #获取系列 已修改
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    # 如果记录中冇导演,系列排在第6位
-    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
-    # 如果记录中有导演,系列排在第7位
-    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
+def getSerise(javbus_json): # get series (revised)
+    result = javbus_json.get('series')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
 def getTag(htmlcode): # 获取标签
     tag = []
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
     try:
         try:
             htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
-
+            javbus_json = json.loads(javbus.main(number))
         except:
             print(number)
         dic = {
             # 标题可使用airav
-            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
-            # 制作商选择使用javbus
-            'studio': getStudio(javbus_htmlcode),
-            # 年份也是用javbus
-            'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
+            'title': getTitle(htmlcode),
+            # studio: try javbus first, then fall back to this site
+            'studio': getStudio(htmlcode, javbus_json),
+            # year: try javbus first, then fall back to this site
+            'year': getYear(htmlcode, javbus_json),
             # 简介 使用 airav
             'outline': getOutline(htmlcode),
             # 使用javbus
-            'runtime': getRuntime(javbus_htmlcode),
+            'runtime': getRuntime(javbus_json),
             # 导演 使用javbus
-            'director': getDirector(javbus_htmlcode),
-            # 作者 使用airav
-            'actor': getActor(javbus_htmlcode),
-            # 发售日使用javbus
-            'release': getRelease(javbus_htmlcode),
+            'director': getDirector(javbus_json),
+            # actors: try airav first
+            'actor': getActor(htmlcode, javbus_json),
+            # release date: try javbus first
+            'release': getRelease(htmlcode, javbus_json),
             # 番号使用javbus
-            'number': getNum(javbus_htmlcode),
+            'number': getNum(htmlcode, javbus_json),
             # 封面链接 使用javbus
-            'cover': getCover(javbus_htmlcode),
+            'cover': getCover(htmlcode, javbus_json),
             # 剧照获取
             'extrafanart': getExtrafanart(htmlcode),
             'imagecut': 1,
             # 使用 airav
             'tag': getTag(htmlcode),
             # 使用javbus
-            'label': getSerise(javbus_htmlcode),
+            'label': getSerise(javbus_json),
             # 妈的,airav不提供作者图片
-            'actor_photo': getActorPhoto(javbus_htmlcode),
-
+#            'actor_photo': getActorPhoto(javbus_json),
             'website': 'https://www.airav.wiki/video/' + number,
             'source': 'airav.py',
             # 使用javbus
-            'series': getSerise(javbus_htmlcode),
+            'series': getSerise(javbus_json)
         }
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )  # .encode('UTF-8')
         return js
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
@@ -226,6 +224,6 @@ def main(number):
 
 
 if __name__ == '__main__':
-    #print(main('ADN-188'))
-    print(main('ADN-188'))
-    print(main('CJOD-278'))
+    print(main('ADV-R0624'))  # the javbus page returns 404, airav has data
+    print(main('ADN-188'))    # one actress
+    print(main('CJOD-278'))   # several actresses; javbus lists actor names in kana, airav in kanji
diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
index 254f3e8..293769a 100644
--- a/WebCrawler/avsox.py
+++ b/WebCrawler/avsox.py
@@ -100,6 +100,9 @@ def main(number):
     soup = BeautifulSoup(web, 'lxml')
     info = str(soup.find(attrs={'class': 'row movie'}))
     try:
+        new_number = getNum(info)
+        if new_number.upper() != number.upper():
+            raise ValueError('number not found')
         dic = {
             'actor': getActor(web),
             'title': getTitle(web).strip(getNum(web)),
@@ -108,7 +111,7 @@ def main(number):
             'runtime': getRuntime(info),
             'director': '',  #
             'release': getRelease(info),
-            'number': getNum(info),
+            'number': new_number,
             'cover': getCover(web),
             'cover_small': getCover_small(a),
             'imagecut': 3,
@@ -121,7 +124,7 @@ def main(number):
             'series': getSeries(info),
         }
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -129,3 +132,4 @@ if __name__ == "__main__":
 
     print(main('012717_472'))
+    print(main('1'))  # the site returns a fake result; should raise 'number not found'
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index 8eee1af..c1a25d9 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -1,51 +1,53 @@
 import sys
 sys.path.append('../')
 import json
-from bs4 import BeautifulSoup
 from lxml import html
 import re
 from ADC_function import *
 
 def main(number: str) -> json:
     try:
-        caribbytes, browser = get_html_by_browser(
+        carib_obj, browser = get_html_by_browser(
             'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
             return_type="browser")
 
-        if not caribbytes or not caribbytes.ok:
+        if not carib_obj or not carib_obj.ok:
             raise ValueError("page not found")
 
         lx = html.fromstring(str(browser.page))
 
         if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
             raise ValueError("page info not found")
+
+        dic = {
+            'title': get_title(lx),
+            'studio': '加勒比',
+            'year': get_year(lx),
+            'outline': get_outline(lx),
+            'runtime': get_runtime(lx),
+            'director': '',
+            'actor': get_actor(lx),
+            'release': get_release(lx),
+            'number': number,
+            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'tag': get_tag(lx),
+            'extrafanart': get_extrafanart(lx),
+            'label': get_series(lx),
+            'imagecut': 1,
+#            'actor_photo': get_actor_photo(browser),
+            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            'source': 'carib.py',
+            'series': get_series(lx),
+        }
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
+        return js
+
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
 
-    dic = {
-        'title': get_title(lx),
-        'studio': '加勒比',
-        'year': get_year(lx),
-        'outline': get_outline(lx),
-        'runtime': get_runtime(lx),
-        'director': '',
-        'actor': get_actor(lx),
-        'release': get_release(lx),
-        'number': number,
-        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
-        'tag': get_tag(lx),
-        'extrafanart': get_extrafanart(lx),
-        'label': get_series(lx),
-        'imagecut': 1,
-#        'actor_photo': get_actor_photo(browser),
-        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
-        'source': 'carib.py',
-        'series': get_series(lx),
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
-    return js
+
 
 def get_title(lx: html.HtmlElement) -> str:
     return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
@@ -114,11 +116,10 @@ def get_actor_photo(browser):
         if pos<0:
             continue
         css = html[pos:pos+100]
-        p0 = css.find('background: url(')
-        p1 = css.find('.jpg)')
-        if p0<0 or p1<0:
+        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
+        if not cssBGjpgs or not len(cssBGjpgs[0]):
             continue
-        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        p = {k: urljoin(browser.url, cssBGjpgs[0])}
         o.update(p)
     return o
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
index 066e04f..d22cdb1 100644
--- a/WebCrawler/dlsite.py
+++ b/WebCrawler/dlsite.py
@@ -153,7 +153,7 @@ def main(number):
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
         return js
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py
index e6ae516..0a51fdc 100644
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -93,10 +93,11 @@ def main(number):
             actor = '素人'
         lx = etree.fromstring(htmlcode2, etree.HTMLParser())
         cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
+        cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
         dic = {
             'title': lx.xpath('/html/head/title/text()')[0],
             'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
             'outline': '',  # getOutline_fc2com(htmlcode2),
             'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
             'director': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
             'series': '',
         }
     except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):
 
 
 if __name__ == '__main__':
     print(main('FC2-1787685'))
+    print(main('FC2-2086710'))
diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py
index 7d0fac6..df14b3b 100644
--- a/WebCrawler/fc2club.py
+++ b/WebCrawler/fc2club.py
@@ -84,7 +84,7 @@ def main(number):
         dic = {
             'title': getTitle_fc2com(htmlcode2),
             'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
             'outline': '',  # getOutline_fc2com(htmlcode2),
             'runtime': '',
             'director': getStudio_fc2com(htmlcode2),
@@ -103,7 +103,7 @@ def main(number):
             'series': '',
         }
     except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index 7446ef3..1af4359 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -6,8 +6,7 @@ from lxml import etree#need install
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
-from WebCrawler import fanza
-from WebCrawler import airav
+import inspect
 
 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -82,12 +81,16 @@ def getCID(htmlcode):
     result = re.sub('/.*?.jpg','',string)
     return result
 def getOutline(number): #获取剧情介绍
+    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
+        return ''  # calls coming from airav.py skip the outline and return at once, so the same data is not fetched twice and processing is not slowed down
     try:
-        response = json.loads(airav.main(number))
-        result = response['outline']
+        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
+        from WebCrawler.airav import getOutline as airav_getOutline
+        result = airav_getOutline(htmlcode)
         return result
     except:
-        return ''
+        pass
+    return ''
 def getSerise(htmlcode): #获取系列 已修改
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     # 如果记录中冇导演,系列排在第6位
@@ -117,13 +120,15 @@ def getExtrafanart(htmlcode): # 获取剧照
     extrafanart_pather = re.compile(r'
+    if "404 Page Not Found" in htmlcode:
+        raise Exception('404 page not found')
     dic = {
         'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
         'studio': getStudio(htmlcode),
@@ -155,6 +160,8 @@ def main(number):
             htmlcode = get_html('https://www.fanbus.us/' + number)
         except:
             htmlcode = get_html('https://www.javbus.com/' + number)
+        if "404 Page Not Found" in htmlcode:
+            raise Exception('404 page not found')
         dic = {
             'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
             'studio': getStudio(htmlcode),
@@ -180,7 +187,7 @@ def main(number):
         except:
             return main_uncensored(number)
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
@@ -191,5 +198,7 @@ def main(number):
     return js
 
 if __name__ == "__main__" :
+    print(main('ADV-R0624'))  # 404
     print(main('ipx-292'))
     print(main('CEMD-011'))
+    print(main('CJOD-278'))
diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index ecc4f36..756be1c 100755
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -5,7 +5,7 @@ from lxml import etree
 import json
 from bs4 import BeautifulSoup
 from ADC_function import *
-from WebCrawler import airav
+import secrets
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -21,7 +21,7 @@ def getActor(a):
     genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
     r = []
     idx = 0
-    actor_gendor = config.Config().actor_gender()
+    actor_gendor = config.getInstance().actor_gender()
     if not actor_gendor in ['female','male','both','all']:
         actor_gendor = 'female'
     for act in actors:
@@ -67,9 +67,15 @@ def getStudio(a):
     patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
     pianshang = patherr.findall(a)
     if pianshang:
-        result = pianshang[0]
-    else:
-        result = ""
+        result = pianshang[0].strip()
+        if len(result):
+            return result
+    # fall back to the seller as the studio
+    html = etree.fromstring(a, etree.HTMLParser())
+    try:
+        result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
+    except:
+        result = ''
     return result
 
 def getRuntime(a):
@@ -171,16 +177,13 @@ def getTrailer(htmlcode): # 获取预告片
     return video_url
 
 def getExtrafanart(htmlcode):  # 获取剧照
-    html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>')
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"')
-        extrafanart_imgs = extrafanart_pather.findall(html)
-        if extrafanart_imgs:
-            return extrafanart_imgs
-    return ''
-
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = []
+    try:
+        result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
+    except:
+        pass
+    return result
 def getCover(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     try:
@@ -195,11 +198,13 @@ def getDirector(a):
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 def getOutline(number): #获取剧情介绍
     try:
-        response = json.loads(airav.main(number))
-        result = response['outline']
+        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
+        from WebCrawler.airav import getOutline as airav_getOutline
+        result = airav_getOutline(htmlcode)
         return result
     except:
-        return ''
+        pass
+    return ''
 def getSeries(a):
     #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
     html = etree.fromstring(a, etree.HTMLParser())   # //table/tr[1]/td[1]/text()
@@ -208,7 +213,7 @@ def getSeries(a):
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
 
 def main(number):
-    javdb_site = random.choice(["javdb9", "javdb30"])
+    javdb_site = secrets.choice(["javdb9", "javdb30"])
     try:
         # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
         #     pass
@@ -303,8 +308,16 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
             'series': getSeries(detail_page),
         }
 
+        if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
+            dic['actor'].append('素人')
+        if not dic['series']:
+            dic['series'] = dic['studio']
+        if not dic['label']:
+            dic['label'] = dic['studio']
+
+
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -316,7 +329,9 @@ if __name__ == "__main__":
     # print(main('blacked.20.05.30'))
     # print(main('AGAV-042'))
     # print(main('BANK-022'))
-    print(main('FC2-735670'))
-    print(main('FC2-1174949')) # not found
+    print(main('093021_539'))  # no stills; the studio is pacopacomama
+    # print(main('FC2-2278260'))
+    # print(main('FC2-735670'))
+    # print(main('FC2-1174949')) # not found
     print(main('MVSD-439'))
-    print(main('EHM0001')) # not found
+    # print(main('EHM0001')) # not found
diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py
index 59f4572..8f58cb6 100644
--- a/WebCrawler/mgstage.py
+++ b/WebCrawler/mgstage.py
@@ -137,7 +137,7 @@ def main(number2):
         'series': getSeries(a),
     }
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
 
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index a7b4cff..858dd54 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -224,7 +224,7 @@ def main(number):
             'series': getSeries(detail_page),
         }
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
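
Reviewer note: the guard added in __init__.py (and mirrored in avsox.py via raise ValueError) simply compares the number the crawler asked for with the number the site sent back, case-insensitively, and drops the record on mismatch. A minimal, self-contained sketch of that check follows; number_matches(), accept_metadata() and the sample dicts are hypothetical helpers for illustration, not part of the patch.

def number_matches(queried: str, returned) -> bool:
    # Case-insensitive comparison between the requested number and the one the site returned.
    return str(returned).upper() == queried.upper()

def accept_metadata(file_number: str, json_data: dict):
    # Mirrors the guard in get_data_from_json(): reject sources that answer every
    # query with the same unrelated record (the "本橋実来 ADZ335" failure mode).
    if not number_matches(file_number, json_data.get('number')):
        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, json_data.get('number')))
        return None
    return json_data

if __name__ == '__main__':
    print(accept_metadata('ADN-188', {'number': 'adn-188', 'title': 'ok'}))    # accepted
    print(accept_metadata('ABC-001', {'number': 'ADZ335', 'title': 'fake'}))   # rejected -> None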
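
Reviewer note: the reworked airav.py getters all share one shape — return the javbus field when it is a non-empty string, otherwise scrape the airav page. A condensed sketch of that pattern is below, assuming a javbus_json dict and a parsed airav page are already available; prefer_javbus(), studio_from_airav() and the sample HTML are illustrative only and not part of the patch.

from lxml import etree

def prefer_javbus(javbus_json, key, airav_fallback):
    value = javbus_json.get(key)
    if isinstance(value, str) and len(value):
        return value            # javbus already has usable data
    return airav_fallback()     # otherwise fall back to scraping the airav page

def studio_from_airav(page):
    # Same xpath the patched getStudio() uses on the airav page.
    return str(page.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")

if __name__ == '__main__':
    page = etree.fromstring('<html><body><a href="?video_factory=1">SomeStudio</a></body></html>',
                            etree.HTMLParser())
    print(prefer_javbus({'studio': ''}, 'studio', lambda: studio_from_airav(page)))   # falls back to airav
    print(prefer_javbus({'studio': 'JavbusStudio'}, 'studio', lambda: ''))            # uses the javbus value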