diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index a117059..8ee81c2 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -86,7 +86,7 @@ def create_data_and_move(file_path: str, c: config.Config): if __name__ == '__main__': - version = '3.4.3' + version = '3.5' # Parse command line args single_file_path, config_file, auto_exit = argparse_function() diff --git a/avsox.py b/avsox.py index e54d8d1..c3d0b6a 100644 --- a/avsox.py +++ b/avsox.py @@ -72,6 +72,13 @@ def getTag(a): # 获取演员 for i in a: d.append(i.get_text()) return d +def getSeries(htmlcode): + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") + return result1 + except: + return '' def main(number): a = get_html('https://avsox.host/cn/search/' + number) @@ -108,8 +115,10 @@ def main(number): 'actor_photo': getActorPhoto(web), 'website': result1, 'source': 'avsox.py', + 'series': getSeries(info), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js -#print(main('012717_472')) \ No newline at end of file +if __name__ == "__main__": + print(main('012717_472')) \ No newline at end of file diff --git a/core.py b/core.py index b1a4725..bfb1a42 100755 --- a/core.py +++ b/core.py @@ -98,6 +98,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON runtime = json_data['runtime'] outline = json_data['outline'] label = json_data['label'] + series = json_data['series'] year = json_data['year'] try: cover_small = json_data['cover_small'] @@ -166,7 +167,8 @@ def get_info(json_data): # 返回json里的数据 number = json_data['number'] cover = json_data['cover'] website = json_data['website'] - return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website + series = json_data['series'] + return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series def small_cover_check(path, number, cover_small, c_word, conf: config.Config, filepath, failed_folder): @@ -263,7 +265,7 @@ def image_download(cover, number, c_word, path, conf: config.Config, filepath, f def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, failed_folder, tag, actor_list, liuchu): - title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website = get_info(json_data) + title, studio, year, outline, runtime, director, actor_photo, release, number, cover, website, series = get_info(json_data) try: if not os.path.exists(path): @@ -300,6 +302,7 @@ def print_files(path, c_word, naming_rule, part, cn_sub, json_data, filepath, fa try: for i in tag: print(" " + i + "", file=code) + print(" " + series + "", file=code) except: aaaaa = '' try: diff --git a/fanza.py b/fanza.py index fe4c7f4..d33abc7 100644 --- a/fanza.py +++ b/fanza.py @@ -108,7 +108,7 @@ def getRelease(text): )[0].lstrip("\n") except: pass - return result + return result.replace('/','-') def getTag(text): @@ -174,6 +174,23 @@ def getOutline(text): return result +def getSeries(text): + try: + html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + try: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()" + )[0] + except: + result = html.xpath( + "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()" + )[0] + return result + except: + return '' + + + def main(number): # fanza allow letter + number + underscore, normalize the input here # @note: I only find the usage of underscore as h_test123456789 @@ -225,6 +242,7 @@ def main(number): "actor_photo": "", "website": chosen_url, "source": "fanza.py", + "series": getSeries(htmlcode), } except: data = { @@ -266,7 +284,4 @@ def main_htmlcode(number): if __name__ == "__main__": - # print(main("DV-1562")) - # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") - # print(main("ipx292")) - pass + print(main("DV-1562")) \ No newline at end of file diff --git a/fc2fans_club.py b/fc2fans_club.py index d24cb3f..2cf5c1e 100755 --- a/fc2fans_club.py +++ b/fc2fans_club.py @@ -159,5 +159,5 @@ def main(number): js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)#.encode('UTF-8') return js - -#print(main('1252953')) +if __name__ == '__main__': + print(main('1252953')) \ No newline at end of file diff --git a/jav321.py b/jav321.py index 9e0665c..7b0baae 100644 --- a/jav321.py +++ b/jav321.py @@ -13,7 +13,6 @@ def main(number: str) -> json: data = parse_info(soup) dic = { "title": get_title(lx), - "studio": "", "year": get_year(data), "outline": get_outline(lx), "director": "", @@ -46,10 +45,12 @@ def parse_info(soup: BeautifulSoup) -> dict: return { "actor": get_actor(data_dic), "label": get_label(data_dic), + "studio": get_studio(data_dic), "tag": get_tag(data_dic), "number": get_number(data_dic), "release": get_release(data_dic), "runtime": get_runtime(data_dic), + "series": get_series(data_dic), } else: return {} @@ -84,6 +85,9 @@ def get_cover(lx: html.HtmlElement) -> str: def get_outline(lx: html.HtmlElement) -> str: return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0] +def get_series2(lx: html.HtmlElement) -> str: + return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0] + def get_actor(data: hash) -> str: if "女优" in data: @@ -106,6 +110,13 @@ def get_tag(data: hash) -> str: return "" +def get_studio(data: hash) -> str: + if "片商" in data: + return get_anchor_info(data["片商"]) + else: + return "" + + def get_number(data: hash) -> str: if "番号" in data: return get_text_info(data["番号"]) @@ -134,5 +145,12 @@ def get_year(data: hash) -> str: return "" +def get_series(data: hash) -> str: + if "系列" in data: + return get_anchor_info(data["系列"]) + else: + return "" + + if __name__ == "__main__": - print(main("wmc-002")) + print(main("soe-259")) diff --git a/javbus.py b/javbus.py index 9f77a25..7d51a4d 100755 --- a/javbus.py +++ b/javbus.py @@ -75,9 +75,12 @@ def getOutline(htmlcode): #获取演员 except: return '' def getSerise(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - return result + try: + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") + return result + except: + return '' def getTag(htmlcode): # 获取演员 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') @@ -113,6 +116,7 @@ def main_uncensored(number): 'actor_photo': '', 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', + 'series': getSerise(htmlcode), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js @@ -143,6 +147,7 @@ def main(number): 'actor_photo': getActorPhoto(htmlcode), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', + 'series': getSerise(htmlcode), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -157,3 +162,6 @@ def main(number): data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") ) return js + +if __name__ == "__main__" : + print(main('ipx-292')) diff --git a/javdb.py b/javdb.py index 31a4e63..ce19601 100755 --- a/javdb.py +++ b/javdb.py @@ -13,8 +13,8 @@ def getTitle(a): return result def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"演員")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img a = actor.split(',') @@ -25,23 +25,23 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img return d def getStudio(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時長")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') def getLabel(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"系列")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getNum(a): html = etree.fromstring(a, etree.HTMLParser()) - result1 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"番號")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') def getYear(getRelease): try: @@ -51,14 +51,18 @@ def getYear(getRelease): return getRelease def getRelease(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"時間")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+') def getTag(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"类别")]/../following-sibling::span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',') + try: + result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') + return result + except: + result = html.xpath('//strong[contains(text(),"類別")]/../span/text()') + return result + def getCover_small(a, index=0): # same issue mentioned below, # javdb sometime returns multiple results @@ -74,17 +78,26 @@ def getCover(htmlcode): return result def getDirector(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/text()')).strip(" ['']") - result2 = str(html.xpath('//strong[contains(text(),"導演")]/../following-sibling::span/a/text()')).strip(" ['']") + result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getOutline(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="introduction"]/dd/p[1]/text()')).strip(" ['']") return result +def getSeries(a): + #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") + result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): try: number = number.upper() - query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') + try: + query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') + except: + query_result = get_html('https://javdb4.com/search?q=' + number + '&f=all') html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # javdb sometime returns multiple results, # and the first elememt maybe not the one we are looking for @@ -111,6 +124,7 @@ def main(number): 'actor_photo': getActorPhoto(getActor(detail_page)), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', + 'series': getSeries(detail_page), } except Exception as e: # print(e) @@ -120,4 +134,5 @@ def main(number): # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") -#print(main('ipx-292')) +if __name__ == "__main__": + print(main('ipx-292')) diff --git a/mgstage.py b/mgstage.py index 9685619..2c6391b 100755 --- a/mgstage.py +++ b/mgstage.py @@ -21,8 +21,8 @@ def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/t return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',') def getStudio(a): html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text() - result1=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') - result2=str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n') + result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n') return str(result1+result2).strip('+').replace("', '",'').replace('"','') def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -55,14 +55,14 @@ def getRelease(a): '\\n') result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( '\\n') - return str(result1 + result2).strip('+') + return str(result1 + result2).strip('+').replace('/','-') def getTag(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( '\\n') result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip( '\\n') - return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','') + return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']") @@ -79,6 +79,13 @@ def getOutline(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '') return result +def getSeries(a): + html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip( + '\\n') + return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number2): number=number2.upper() htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'})) @@ -103,6 +110,7 @@ def main(number2): 'actor_photo': '', 'website':'https://www.mgstage.com/product/product_detail/'+str(number)+'/', 'source': 'mgstage.py', + 'series': getSeries(a), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js diff --git a/xcity.py b/xcity.py index 3ea001d..0a12ad8 100644 --- a/xcity.py +++ b/xcity.py @@ -70,7 +70,7 @@ def getRelease(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')).strip(" ['']") try: - return re.findall('\d{4}/\d{2}/\d{2}', result1)[0] + return re.findall('\d{4}/\d{2}/\d{2}', result1)[0].replace('/','-') except: return '' @@ -122,8 +122,7 @@ def main(number): try: number = number.upper() query_result = get_html( - 'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-', - '') + '&sg=main&num=30') + 'https://xcity.jp/result_published/?genre=%2Fresult_published%2F&q=' + number.replace('-','') + '&sg=main&num=30') html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() urls = html.xpath("//table[contains(@class, 'resultList')]/tr[2]/td[1]/a/@href")[0] detail_page = get_html('https://xcity.jp' + urls)