Patch summary for WebCrawler/javbus.py (text before the diff header is commentary):
  * getStudio, getDirector and getSerise now compare the text of the label <span>
    in the candidate <p> row (maker / director / series labels) before reading the
    <a> text of that row, and return '' when no candidate row carries the label.
    getStudio tries rows 4 then 5; getSerise tries rows 6 then 7; getDirector
    only checks row 4.
  * getRuntime reads the text of row 3 via XPath and strips the surrounding list
    punctuation and the minute-unit characters, instead of searching with
    BeautifulSoup.
  * The trailing comment on getTag is corrected, and one blank line is added
    before main_uncensored.
  * Every javbus.com URL (fetches and the 'website' field) gains the /ja/ prefix.
NOTE(review): this patch was restored from a copy whose line breaks and leading
whitespace had been collapsed. Line boundaries were checked against the hunk
header counts; leading indentation and in-line spacing are reconstructed from
the Python block structure, so confirm against the target file before applying
(or apply with whitespace-insensitive matching).

diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index 42446e2..fe955b6 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -29,9 +29,16 @@ def getTitle(htmlcode): #获取标题
         return title2
     except:
         return title
-def getStudio(htmlcode): #获取厂商
+def getStudio(htmlcode): #获取厂商 已修改
     html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
+    # 如果记录中冇导演,厂商排在第4位
+    if 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    # 如果记录中有导演,厂商排在第5位
+    elif 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
+    else:
+        result = ''
     return result
 def getYear(htmlcode): #获取年份
     html = etree.fromstring(htmlcode,etree.HTMLParser())
@@ -45,10 +52,10 @@ def getRelease(htmlcode): #获取出版日期
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
     return result
-def getRuntime(htmlcode): #获取分钟
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find(text=re.compile('分鐘'))
-    return a
+def getRuntime(htmlcode): #获取分钟 已修改
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
+    return result
 def getActor(htmlcode): #获取女优
     b=[]
     soup=BeautifulSoup(htmlcode,'lxml')
@@ -60,9 +67,12 @@ def getNum(htmlcode): #获取番号
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
     return result
-def getDirector(htmlcode): #获取导演
+def getDirector(htmlcode): #获取导演 已修改
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    if '監督:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
+    else:
+        result = '' # 记录中有可能没有导演数据
     return result
 def getCID(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -77,14 +87,18 @@ def getOutline(htmlcode): #获取演员
         return result
     except:
         return ''
-def getSerise(htmlcode):
-    try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getSerise(htmlcode): #获取系列 已修改
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    # 如果记录中冇导演,系列排在第6位
+    if 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
+        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
+    # 如果记录中有导演,系列排在第7位
+    elif 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-        return result
-    except:
-        return ''
-def getTag(htmlcode): # 获取演员
+    else:
+        result = ''
+    return result
+def getTag(htmlcode): # 获取标签
     tag = []
     soup = BeautifulSoup(htmlcode, 'lxml')
     a = soup.find_all(attrs={'class': 'genre'})
@@ -94,10 +108,11 @@ def getTag(htmlcode): # 获取演员
         tag.append(i.get_text())
     return tag
 
+
 def main_uncensored(number):
-    htmlcode = get_html('https://www.javbus.com/' + number)
+    htmlcode = get_html('https://www.javbus.com/ja/' + number)
     if getTitle(htmlcode) == '':
-        htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_'))
+        htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
     try:
         dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
     except:
@@ -117,7 +132,7 @@ def main_uncensored(number):
         'label': getSerise(htmlcode),
         'imagecut': 0,
         'actor_photo': '',
-        'website': 'https://www.javbus.com/' + number,
+        'website': 'https://www.javbus.com/ja/' + number,
         'source': 'javbus.py',
         'series': getSerise(htmlcode),
     }
@@ -128,7 +143,7 @@ def main_uncensored(number):
 def main(number):
     try:
         try:
-            htmlcode = get_html('https://www.javbus.com/' + number)
+            htmlcode = get_html('https://www.javbus.com/ja/' + number)
             try:
                 dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
             except:
@@ -148,7 +163,7 @@ def main(number):
                 'tag': getTag(htmlcode),
                 'label': getSerise(htmlcode),
                 'actor_photo': getActorPhoto(htmlcode),
-                'website': 'https://www.javbus.com/' + number,
+                'website': 'https://www.javbus.com/ja/' + number,
                 'source': 'javbus.py',
                 'series': getSerise(htmlcode),
             }