diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py
index 9a134b0..baac89b 100644
--- a/WebCrawler/fanza.py
+++ b/WebCrawler/fanza.py
@@ -123,10 +123,24 @@ def getTag(text):
         result = html.xpath(
             "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
         )
+        total = []
+        for i in result:
+            try:
+                total.append(translateTag_to_sc(i))
+            except:
+                pass
+        return total
     except:
         result = html.xpath(
             "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
         )
+        total = []
+        for i in result:
+            try:
+                total.append(translateTag_to_sc(i))
+            except:
+                pass
+        return total
     return result
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index fe955b6..344d9d5 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -8,7 +8,6 @@ import json
 from ADC_function import *
 from WebCrawler import fanza
 
-
 def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
     soup = BeautifulSoup(htmlcode, 'lxml')
     a = soup.find_all(attrs={'class': 'star-name'})
@@ -32,10 +31,10 @@ def getTitle(htmlcode): #获取标题
 def getStudio(htmlcode): #获取厂商 已修改
     html = etree.fromstring(htmlcode,etree.HTMLParser())
     # 如果记录中冇导演,厂商排在第4位
-    if 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+    if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
     # 如果记录中有导演,厂商排在第5位
-    elif 'メーカー:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
+    elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
     else:
         result = ''
@@ -69,7 +68,7 @@ def getNum(htmlcode): #获取番号
     return result
 def getDirector(htmlcode): #获取导演 已修改
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '監督:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
+    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
     else:
         result = ''  # 记录中有可能没有导演数据
@@ -90,10 +89,10 @@ def getOutline(htmlcode): #获取演员
 def getSerise(htmlcode): #获取系列 已修改
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     # 如果记录中冇导演,系列排在第6位
-    if 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
+    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
     # 如果记录中有导演,系列排在第7位
-    elif 'シリーズ:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
+    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
         result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
     else:
         result = ''
@@ -105,10 +104,9 @@ def getTag(htmlcode): # 获取标签
     for i in a:
         if 'onmouseout' in str(i):
             continue
-        tag.append(i.get_text())
+        tag.append(translateTag_to_sc(i.get_text()))
     return tag
 
-
 def main_uncensored(number):
     htmlcode = get_html('https://www.javbus.com/ja/' + number)
     if getTitle(htmlcode) == '':
@@ -143,7 +141,7 @@ def main_uncensored(number):
 def main(number):
     try:
         try:
-            htmlcode = get_html('https://www.javbus.com/ja/' + number)
+            htmlcode = get_html('https://www.javbus.com/' + number)
             try:
                 dww_htmlcode = fanza.main_htmlcode(getCID(htmlcode))
             except:
@@ -163,7 +161,7 @@ def main(number):
             'tag': getTag(htmlcode),
             'label': getSerise(htmlcode),
             'actor_photo': getActorPhoto(htmlcode),
-            'website': 'https://www.javbus.com/ja/' + number,
+            'website': 'https://www.javbus.com/' + number,
             'source': 'javbus.py',
             'series': getSerise(htmlcode),
         }
diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index f8b1fdb..eda8cb6 100644
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -60,10 +60,23 @@ def getTag(a):
     html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     try:
         result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
-        return result
+        total = []
+        for i in result:
+            try:
+                total.append(translateTag_to_sc(i))
+            except:
+                pass
+        return total
+
     except:
         result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
-        return result
+        total = []
+        for i in result:
+            try:
+                total.append(translateTag_to_sc(i))
+            except:
+                pass
+        return total
 
 def getCover_small(a, index=0):
     # same issue mentioned below,
@@ -162,4 +175,4 @@ def main(number):
 # main('DV-1562')
 # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
 if __name__ == "__main__":
-    print(main('GS-351'))
+    print(main('ipx-292'))
diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py
index 337af23..089bbf2 100644
--- a/WebCrawler/mgstage.py
+++ b/WebCrawler/mgstage.py
@@ -64,7 +64,14 @@ def getTag(a):
         '\\n')
     result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
         '\\n')
-    return str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
+    result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
+    total = []
+    for i in result:
+        try:
+            total.append(translateTag_to_sc(i))
+        except:
+            pass
+    return total
 def getCover(htmlcode):
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
diff --git a/config.ini b/config.ini
index 1f391a7..e8fbd0b 100644
--- a/config.ini
+++ b/config.ini
@@ -5,6 +5,7 @@ success_output_folder=JAV_output
 soft_link=0
 failed_move=1
 auto_exit=0
+transalte_to_sc=1
 
 [proxy]
 ;proxytype: http or socks5 or socks5h
diff --git a/config.py b/config.py
index 6389444..77865ef 100644
--- a/config.py
+++ b/config.py
@@ -33,6 +33,8 @@ class Config:
         return self.conf.getboolean("common", "failed_move")
     def auto_exit(self) -> bool:
         return self.conf.getboolean("common", "auto_exit")
+    def transalte_to_sc(self) -> bool:
+        return self.conf.getboolean("common", "transalte_to_sc")
 
     def proxy(self) -> [str, int, int, str]:
         try:
@@ -87,13 +89,14 @@ class Config:
         conf.set(sec1, "soft_link", "0")
         conf.set(sec1, "failed_move", "1")
         conf.set(sec1, "auto_exit", "0")
+        conf.set(sec1, "transalte_to_sc", "1")
 
         sec2 = "proxy"
         conf.add_section(sec2)
         conf.set(sec2, "proxy", "127.0.0.1:1080")
         conf.set(sec2, "timeout", "10")
         conf.set(sec2, "retry", "3")
-        conf.set(sec2, "type", "http")
+        conf.set(sec2, "type", "socks5")
 
         sec3 = "Name_Rule"
         conf.add_section(sec3)
diff --git a/core.py b/core.py
index 0ae674d..81f2433 100755
--- a/core.py
+++ b/core.py
@@ -484,13 +484,8 @@ def get_part(filepath, failed_folder):
 def debug_print(data: json):
     try:
         print("[+] ---Debug info---")
-        for i, v in data.items():
-            if i == "outline":
-                print("[+] -", i, " :", len(v), "characters")
-                continue
-            if i == "actor_photo" or i == "year":
-                continue
-            print("[+] -", "%-11s" % i, ":", v)
+
+        print("[+] ---Debug info---")
     except:
         pass