Patch: take the 'outline' field of javbus/javlib results from airav.main()
instead of scraping fanza (javbus) or leaving it empty (javlib).

NOTE(review): the line structure of this patch had been collapsed; the hunks
were re-split so that the line counts agree with every @@ header. Indentation
and blank context lines are reconstructed and could not be checked against the
target files - confirm, or apply with `git apply --ignore-whitespace`.
The "index" hashes are those of the original patch and are informational only.

diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index 8e9f5cb..add509c 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -225,8 +225,8 @@ def main(number):
 
 if __name__ == '__main__':
     #print(main('ADN-188'))
-    print(search('ADN-188'))
-    print(search('012717_472'))
-    print(search('080719-976'))
-    print(search('姫川ゆうな'))
+    print(main('ADN-188'))
+    print(main('012717_472'))
+    print(main('080719-976'))
+    print(main('姫川ゆうな'))
 
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index dc3743e..d9acbf2 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -7,6 +7,7 @@ from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
 from WebCrawler import fanza
+from WebCrawler import airav
 
 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -79,12 +80,13 @@ def getCID(htmlcode):
     string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
     result = re.sub('/.*?.jpg','',string)
     return result
-def getOutline(htmlcode): #获取演员
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getOutline(number): # get the outline (synopsis) from the airav crawler; '' on any failure
     try:
-        result = html.xpath("string(//div[contains(@class,'mg-b20 lh4')])").replace('\n','')
+        response = json.loads(airav.main(number))
+        result = response['outline']
         return result
-    except:
+    except Exception as e:
+        print(e)
         return ''
 def getSerise(htmlcode): #获取系列 已修改
     html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -122,15 +124,11 @@ def main_uncensored(number):
     htmlcode = get_html('https://www.javbus.com/ja/' + number)
     if getTitle(htmlcode) == '':
         htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
-    try:
-        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
-    except:
-        dww_htmlcode = ''
     dic = {
         'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
         'studio': getStudio(htmlcode),
         'year': getYear(htmlcode),
-        'outline': getOutline(dww_htmlcode),
+        'outline': getOutline(number),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
@@ -157,15 +155,11 @@ def main(number):
         htmlcode = get_html('https://www.fanbus.us/' + number)
     except:
         htmlcode = get_html('https://www.javbus.com/' + number)
-    try:
-        dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
-    except:
-        dww_htmlcode = ''
     dic = {
         'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
         'studio': getStudio(htmlcode),
         'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
-        'outline': getOutline(dww_htmlcode),
+        'outline': getOutline(number),
         'runtime': getRuntime(htmlcode),
         'director': getDirector(htmlcode),
         'actor': getActor(htmlcode),
diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py
index 1e1ba0d..90c4575 100644
--- a/WebCrawler/javlib.py
+++ b/WebCrawler/javlib.py
@@ -3,6 +3,7 @@ sys.path.append('../')
 import json
 import bs4
 import re
+import airav
 from bs4 import BeautifulSoup
 from lxml import html
 from http.cookies import SimpleCookie
@@ -42,7 +43,7 @@ def main(number: str):
             "title": get_title(lx, soup),
             "studio": get_table_el_single_anchor(soup, "video_maker"),
             "year": get_table_el_td(soup, "video_date")[:4],
-            "outline": "",
+            "outline": get_outline(number),
             "director": get_table_el_single_anchor(soup, "video_director"),
             "cover": get_cover(lx),
             "imagecut": 1,
@@ -77,7 +78,7 @@ def main(number: str):
             "title": get_title(lx, soup),
             "studio": get_table_el_single_anchor(soup, "video_maker"),
             "year": get_table_el_td(soup, "video_date")[:4],
-            "outline": "",
+            "outline": get_outline(number),
             "director": get_table_el_single_anchor(soup, "video_director"),
             "cover": get_cover(lx),
             "imagecut": 1,
@@ -102,6 +103,15 @@ def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
     return lx.xpath(xpath)[0].strip()
 
 
+def get_outline(number):
+    try:
+        response = json.loads(airav.main(number))
+        result = response['outline']
+        return result
+    except Exception:
+        return ''
+
+
 def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
     tag = soup.find(id=tag_id).find("a")
 
@@ -145,7 +155,7 @@ def get_cover(lx: html.HtmlComment) -> str:
 
 
 if __name__ == "__main__":
-    lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
     #lists = ["DVMC-003"]
     for num in lists:
         print(main(num))