diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 42446e2..fe955b6 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -29,9 +29,16 @@ def getTitle(htmlcode): #获取标题 return title2 except: return title -def getStudio(htmlcode): #获取厂商 +def getStudio(htmlcode): #获取厂商 已修改 html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") + # 如果记录中冇导演,厂商排在第4位 + if 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") + # 如果记录中有导演,厂商排在第5位 + elif 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") + else: + result = '' return result def getYear(htmlcode): #获取年份 html = etree.fromstring(htmlcode,etree.HTMLParser()) @@ -45,10 +52,10 @@ def getRelease(htmlcode): #获取出版日期 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result -def getRuntime(htmlcode): #获取分钟 - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find(text=re.compile('分鐘')) - return a +def getRuntime(htmlcode): #获取分钟 已修改 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") + return result def getActor(htmlcode): #获取女优 b=[] soup=BeautifulSoup(htmlcode,'lxml') @@ -60,9 +67,12 @@ def getNum(htmlcode): #获取番号 html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") return result -def getDirector(htmlcode): #获取导演 +def getDirector(htmlcode): #获取导演 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") + if '監督:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") + else: + result = '' # 记录中有可能没有导演数据 return result def getCID(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) @@ -77,14 +87,18 @@ def getOutline(htmlcode): #获取演员 return result except: return '' -def getSerise(htmlcode): - try: - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getSerise(htmlcode): #获取系列 已修改 + html = etree.fromstring(htmlcode, etree.HTMLParser()) + # 如果记录中冇导演,系列排在第6位 + if 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") + # 如果记录中有导演,系列排在第7位 + elif 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - return result - except: - return '' -def getTag(htmlcode): # 获取演员 + else: + result = '' + return result +def getTag(htmlcode): # 获取标签 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') a = soup.find_all(attrs={'class': 'genre'}) @@ -94,10 +108,11 @@ def getTag(htmlcode): # 获取演员 tag.append(i.get_text()) return tag + def main_uncensored(number): - htmlcode = get_html('https://www.javbus.com/' + number) + htmlcode = get_html('https://www.javbus.com/ja/' + number) if getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/' + number.replace('-','_')) + htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) try: dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) except: @@ -117,7 +132,7 @@ def main_uncensored(number): 'label': getSerise(htmlcode), 'imagecut': 0, 'actor_photo': '', - 'website': 'https://www.javbus.com/' + number, + 'website': 'https://www.javbus.com/ja/' + number, 'source': 'javbus.py', 'series': getSerise(htmlcode), } @@ -128,7 +143,7 @@ def main_uncensored(number): def main(number): try: try: - htmlcode = get_html('https://www.javbus.com/' + number) + htmlcode = get_html('https://www.javbus.com/ja/' + number) try: dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode)) except: @@ -148,7 +163,7 @@ def main(number): 'tag': getTag(htmlcode), 'label': getSerise(htmlcode), 'actor_photo': getActorPhoto(htmlcode), - 'website': 'https://www.javbus.com/' + number, + 'website': 'https://www.javbus.com/ja/' + number, 'source': 'javbus.py', 'series': getSerise(htmlcode), } diff --git a/core.py b/core.py index 49bf590..9ccdbd9 100755 --- a/core.py +++ b/core.py @@ -152,6 +152,43 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON cover_small = tmpArr[0].strip('\"').strip('\'') # ====================处理异常字符 END================== #\/:*?"<>| + # === 替换Studio片假名 + studio = studio.replace('アイエナジー','Energy') + studio = studio.replace('アイデアポケット','Idea Pocket') + studio = studio.replace('アキノリ','AKNR') + studio = studio.replace('アタッカーズ','Attackers') + studio = re.sub('アパッチ.*','Apache',studio) + studio = studio.replace('アマチュアインディーズ','SOD') + studio = studio.replace('アリスJAPAN','Alice Japan') + studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex') + studio = studio.replace('クリスタル映像','Crystal 映像') + studio = studio.replace('グローリークエスト','Glory Quest') + studio = studio.replace('ダスッ!','DAS!') + studio = studio.replace('ディープス','DEEP’s') + studio = studio.replace('ドグマ','Dogma') + studio = studio.replace('プレステージ','PRESTIGE') + studio = studio.replace('ムーディーズ','MOODYZ') + studio = studio.replace('メディアステーション','宇宙企画') + studio = studio.replace('ワンズファクトリー','WANZ FACTORY') + studio = studio.replace('エスワン ナンバーワンスタイル','S1') + studio = studio.replace('エスワンナンバーワンスタイル','S1') + studio = studio.replace('SODクリエイト','SOD') + studio = studio.replace('サディスティックヴィレッジ','SOD') + studio = studio.replace('V&Rプロダクツ','V&R PRODUCE') + studio = studio.replace('V&RPRODUCE','V&R PRODUCE') + studio = studio.replace('レアルワークス','Real Works') + studio = studio.replace('マックスエー','MAX-A') + studio = studio.replace('ピーターズMAX','PETERS MAX') + studio = studio.replace('プレミアム','PREMIUM') + studio = studio.replace('ナチュラルハイ','NATURAL HIGH') + studio = studio.replace('マキシング','MAXING') + studio = studio.replace('エムズビデオグループ','M’s Video Group') + studio = studio.replace('ミニマム','Minimum') + studio = studio.replace('ワープエンタテインメント','WAAP Entertainment') + studio = re.sub('.*/妄想族','妄想族',studio) + studio = studio.replace('/',' ') + # === 替换Studio片假名 END + location_rule = eval(conf.location_rule()) # Process only Windows. @@ -357,7 +394,7 @@ def cutImage(imagecut, path, number, c_word): imgSize = img.size w = img.width h = img.height - img2 = img.crop((w / 1.9, 0, w, h)) + img2 = img.crop((w - h / 1.5, 0, w, h)) img2.save(path + '/' + number + c_word + '-poster.jpg') print('[+]Image Cutted! ' + path + '/' + number + c_word + '-poster.jpg') except: