From e6870357227a3e4b2940a5021c617f512a2a0b5e Mon Sep 17 00:00:00 2001 From: root Date: Fri, 14 Aug 2020 17:00:31 +0800 Subject: [PATCH] Update 3.7-5 DEBUG ONLY --- AV_Data_Capture.py | 41 ++++++++------- WebCrawler/__init__.py | 0 WebCrawler/avsox.py | 2 + WebCrawler/dlsite.py | 102 +++++++++++++++++++++++++------------ WebCrawler/fanza.py | 2 + WebCrawler/fc2fans_club.py | 2 + WebCrawler/jav321.py | 2 + WebCrawler/javbus.py | 5 +- WebCrawler/javdb.py | 2 + WebCrawler/javlib.py | 2 + WebCrawler/mgstage.py | 2 + WebCrawler/xcity.py | 2 + config.ini | 2 +- core.py | 20 ++++---- 14 files changed, 122 insertions(+), 64 deletions(-) create mode 100644 WebCrawler/__init__.py diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 8d9d793..1346630 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -63,31 +63,32 @@ def CEF(path): a = '' -def create_data_and_move(file_path: str, c: config.Config): +def create_data_and_move(file_path: str, c: config.Config,debug): # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 n_number = get_number(file_path) - # print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number)) - # core_main(file_path, n_number, c) - # print("[*]======================================================") - - try: + if debug == True: print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number)) core_main(file_path, n_number, c) print("[*]======================================================") - except Exception as err: - print("[-] [{}] ERROR:".format(file_path)) - print('[-]', err) + else: + try: + print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number)) + core_main(file_path, n_number, c) + print("[*]======================================================") + except Exception as err: + print("[-] [{}] ERROR:".format(file_path)) + print('[-]', err) - if c.soft_link(): - print("[-]Link {} to failed folder".format(file_path)) - os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/") - else: - try: - print("[-]Move [{}] to failed folder".format(file_path)) - shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/") - except Exception as err: - print('[!]', err) + if c.soft_link(): + print("[-]Link {} to failed folder".format(file_path)) + os.symlink(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/") + else: + try: + print("[-]Move [{}] to failed folder".format(file_path)) + shutil.move(file_path, str(os.getcwd()) + "/" + conf.failed_folder() + "/") + except Exception as err: + print('[!]', err) def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number=None): try: @@ -145,13 +146,15 @@ if __name__ == '__main__': count = 0 count_all = str(len(movie_list)) print('[+]Find', count_all, 'movies') + if conf.debug() == True: + print('[+]'+' DEBUG MODE ON '.center(54, '-')) if conf.soft_link(): print('[!] --- Soft link mode is ENABLE! ----') for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -') - create_data_and_move(movie_path, conf) + create_data_and_move(movie_path, conf, conf.debug()) CEF(conf.success_folder()) CEF(conf.failed_folder()) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index c3d0b6a..8aec076 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import re from lxml import etree import json diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py index f4d1501..f5ba5aa 100644 --- a/WebCrawler/dlsite.py +++ b/WebCrawler/dlsite.py @@ -2,6 +2,8 @@ import re from lxml import etree import json from bs4 import BeautifulSoup +import sys +sys.path.append('../') from ADC_function import * # import sys # import io @@ -24,7 +26,10 @@ def getTitle(a): return result def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()') + try: + result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()') + except: + result1 = '' return result1 def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img a = actor.split(',') @@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img return d def getStudio(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0] + try: + try: + result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] + except: + result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0] + except: + result = '' return result def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -44,7 +55,13 @@ def getRuntime(a): return str(result1 + result2).strip('+').rstrip('mi') def getLabel(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0] + try: + try: + result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] + except: + result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0] + except: + result = '' return result def getYear(getRelease): try: @@ -54,12 +71,12 @@ def getYear(getRelease): return getRelease def getRelease(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0] + result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0] return result1.replace('年','-').replace('月','-').replace('日','') def getTag(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: - result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()') + result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()') return result except: return '' @@ -85,7 +102,10 @@ def getCover(htmlcode): return result def getDirector(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0] + try: + result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0] + except: + result = '' return result def getOutline(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) @@ -96,36 +116,52 @@ def getOutline(htmlcode): return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") def getSeries(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()') - return result1 + try: + try: + result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] + except: + result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0] + except: + result = '' + return result def main(number): - number = number.upper() - htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html') + try: + number = number.upper() + htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', + cookies={'locale': 'zh-cn'}) - dic = { - 'actor': getActor(htmlcode), - 'title': getTitle(htmlcode), - 'studio': getStudio(htmlcode), - 'outline': getOutline(htmlcode), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'release': getRelease(htmlcode), - 'number': number, - 'cover': 'https:' + getCover(htmlcode), - 'cover_small': '', - 'imagecut': 0, - 'tag': getTag(htmlcode), - 'label': getLabel(htmlcode), - 'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': '', - 'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', - 'source': 'dlsite.py', - 'series': getSeries(htmlcode), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') - return js + dic = { + 'actor': getActor(htmlcode), + 'title': getTitle(htmlcode), + 'studio': getStudio(htmlcode), + 'outline': getOutline(htmlcode), + 'runtime': '', + 'director': getDirector(htmlcode), + 'release': getRelease(htmlcode), + 'number': number, + 'cover': 'https:' + getCover(htmlcode), + 'cover_small': '', + 'imagecut': 0, + 'tag': getTag(htmlcode), + 'label': getLabel(htmlcode), + 'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()), + 'actor_photo': '', + 'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', + 'source': 'dlsite.py', + 'series': getSeries(htmlcode), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + return js + except: + data = { + "title": "", + } + js = json.dumps( + data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":") + ) + return js # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") if __name__ == "__main__": - print(main('VJ013479')) + print(main('VJ013178')) diff --git a/WebCrawler/fanza.py b/WebCrawler/fanza.py index 71aab6a..9a134b0 100644 --- a/WebCrawler/fanza.py +++ b/WebCrawler/fanza.py @@ -1,5 +1,7 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- +import sys +sys.path.append('../') import json import re from urllib.parse import urlencode diff --git a/WebCrawler/fc2fans_club.py b/WebCrawler/fc2fans_club.py index 2c31a51..a072a78 100644 --- a/WebCrawler/fc2fans_club.py +++ b/WebCrawler/fc2fans_club.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import re from lxml import etree#need install import json diff --git a/WebCrawler/jav321.py b/WebCrawler/jav321.py index 7b0baae..2d0b0b0 100644 --- a/WebCrawler/jav321.py +++ b/WebCrawler/jav321.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import json from bs4 import BeautifulSoup from lxml import html diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 7d51a4d..42446e2 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -1,10 +1,13 @@ +import sys +sys.path.append('../') import re from pyquery import PyQuery as pq#need install from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * -import fanza +from WebCrawler import fanza + def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img soup = BeautifulSoup(htmlcode, 'lxml') diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index b1656d0..362ab94 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import re from lxml import etree import json diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py index cb6f78b..ff2c22d 100644 --- a/WebCrawler/javlib.py +++ b/WebCrawler/javlib.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import json import bs4 from bs4 import BeautifulSoup diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py index 2c6391b..337af23 100644 --- a/WebCrawler/mgstage.py +++ b/WebCrawler/mgstage.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import re from lxml import etree import json diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index fda4f2c..21a1389 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -1,3 +1,5 @@ +import sys +sys.path.append('../') import re from lxml import etree import json diff --git a/config.ini b/config.ini index 97ec400..aa28c5b 100644 --- a/config.ini +++ b/config.ini @@ -26,4 +26,4 @@ literals=\()/ folders=failed,JAV_output [debug_mode] -switch=0 \ No newline at end of file +switch=1 \ No newline at end of file diff --git a/core.py b/core.py index ca1019c..4a7bfbd 100755 --- a/core.py +++ b/core.py @@ -8,16 +8,16 @@ from PIL import Image from ADC_function import * # =========website======== -import avsox -import fanza -import fc2fans_club -import jav321 -import javbus -import javdb -import mgstage -import xcity -import javlib -import dlsite +from WebCrawler import avsox +from WebCrawler import fanza +from WebCrawler import fc2fans_club +from WebCrawler import jav321 +from WebCrawler import javbus +from WebCrawler import javdb +from WebCrawler import mgstage +from WebCrawler import xcity +from WebCrawler import javlib +from WebCrawler import dlsite def escape_path(path, escape_literals: str): # Remove escape literals