From f21fdcb5f5a5ec242b31f1130d71d54072c2985b Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 09:53:09 +0800 Subject: [PATCH 01/56] log dir adapts to makedirs(), fix CmdLine output --- AV_Data_Capture.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index c1c7ee4..5d3263f 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -59,6 +59,7 @@ class OutLogger(object): def __init__(self, logfile) -> None: self.term = sys.stdout self.log = open(logfile,"w",encoding='utf-8',buffering=1) + self.filepath = logfile def __del__(self): self.close() def __enter__(self): @@ -85,6 +86,7 @@ class ErrLogger(OutLogger): def __init__(self, logfile) -> None: self.term = sys.stderr self.log = open(logfile,"w",encoding='utf-8',buffering=1) + self.filepath = logfile def close(self): if self.term != None: sys.stderr = self.term @@ -97,10 +99,15 @@ class ErrLogger(OutLogger): def dupe_stdout_to_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0: return - if not os.path.isdir(logdir): - os.makedirs(logdir) + if not os.path.exists(logdir): + try: + os.makedirs(logdir) + except: + pass if not os.path.isdir(logdir): return + elif not os.path.isdir(logdir): + return log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt') @@ -113,8 +120,16 @@ def dupe_stdout_to_logfile(logdir: str): def close_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): return + #日志关闭前保存日志文件路径 + filepath = '' + try: + filepath = sys.stdout.filepath + except: + pass sys.stdout.close() sys.stderr.close() + if len(filepath): + print("Log file '{}' saved.".format(filepath)) # 清理空文件 for current_dir, subdirs, files in os.walk(logdir, topdown=False): try: @@ -304,7 +319,8 @@ if __name__ == '__main__': print('[+]Enable debug') if conf.soft_link(): print('[!]Enable soft link') - #print('[!]CmdLine:'," ".join(sys.argv[1:])) + if len(sys.argv)>1: + print('[!]CmdLine:'," ".join(sys.argv[1:])) create_failed_folder(conf.failed_folder()) start_time = time.time() @@ -353,9 +369,10 @@ if __name__ == '__main__': " End at", time.strftime("%Y-%m-%d %H:%M:%S")) print("[+]All finished!!!") - if not (conf.auto_exit() or auto_exit): - input("Press enter key exit, you can check the error message before you exit...") close_logfile(logdir) + if not (conf.auto_exit() or auto_exit): + input("Press enter key exit, you can check the error message before you exit...") + sys.exit(0) From f52db0011c222ae6c543b11588eab34fb1a34e4a Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:21:47 +0800 Subject: [PATCH 02/56] optimize if logic --- AV_Data_Capture.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 5d3263f..1f2df85 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -104,9 +104,7 @@ def dupe_stdout_to_logfile(logdir: str): os.makedirs(logdir) except: pass - if not os.path.isdir(logdir): - return - elif not os.path.isdir(logdir): + if not os.path.isdir(logdir): return log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") From 8ef87c285fbbfd615dac9a5efa5c5ec927bc8ff6 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:43:54 +0800 Subject: [PATCH 03/56] =?UTF-8?q?=E5=86=8D=E5=B0=86=E5=85=B6=E5=AE=83?= =?UTF-8?q?=E5=87=A0=E4=B8=AAmakedirs()=E4=B8=80=E8=B5=B7=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=EF=BC=8C=E5=8E=BB=E6=8E=89=E9=94=99=E4=B8=8A=E5=8A=A0?= 
=?UTF-8?q?=E9=94=99=E7=9A=84=E6=8F=90=E5=8D=87=E5=88=B0admin=E5=BB=BA?= =?UTF-8?q?=E8=AE=AE=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 6 ++---- core.py | 40 ++++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 1f2df85..766d826 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -214,13 +214,11 @@ def movie_lists(root, conf, regexstr): def create_failed_folder(failed_folder): - if not os.path.isdir(failed_folder): # 新建failed文件夹 + if not os.path.exists(failed_folder): # 新建failed文件夹 try: os.makedirs(failed_folder) - if not os.path.isdir(failed_folder): - raise except: - print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)") + print(f"[-]Fatal error! Can not make folder '{failed_folder}'") sys.exit(0) diff --git a/core.py b/core.py index cb1a782..90da00c 100755 --- a/core.py +++ b/core.py @@ -83,17 +83,19 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 location_rule = location_rule.replace(title, shorttitle) path = os.path.join(success_folder, location_rule).strip() - if not os.path.isdir(path): + if not os.path.exists(path): path = escape_path(path, conf.escape_literals()) try: os.makedirs(path) - if not os.path.isdir(path): - raise except: path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number") path = escape_path(path, conf.escape_literals()) + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) - os.makedirs(path) return path @@ -106,10 +108,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa for i in range(configProxy.retry): try: if configProxy.enable: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) proxies = configProxy.proxies() headers = { 'User-Agent': G_USER_AGENT} @@ -121,10 +125,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa code.write(r.content) return else: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) headers = { 'User-Agent': G_USER_AGENT} r = requests.get(url, timeout=configProxy.timeout, headers=headers) @@ -224,10 +230,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f else: nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo") try: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! 
can not make folder '{path}'") + sys.exit(0) with open(nfo_path, "wt", encoding='UTF-8') as code: print('', file=code) print("", file=code) From 6d1e99d8ab98ac991d601c2708fe14b4267772e1 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:44:57 +0800 Subject: [PATCH 04/56] fix issue 603 --- core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core.py b/core.py index 90da00c..7794b9e 100755 --- a/core.py +++ b/core.py @@ -96,7 +96,7 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 print(f"[-]Fatal error! Can not make folder '{path}'") sys.exit(0) - return path + return os.path.normcase(path) # =====================资源下载部分=========================== From 952e2c9a30377b677ec9167041d3d51c727065f0 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:59:25 +0800 Subject: [PATCH 05/56] =?UTF-8?q?=E6=89=80=E6=9C=89makedirs()=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E5=81=9A=E7=9B=B8=E5=90=8C=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 766d826..257d9b0 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -103,7 +103,8 @@ def dupe_stdout_to_logfile(logdir: str): try: os.makedirs(logdir) except: - pass + print(f"[-]Fatal error! Can not make log folder '{logdir}'") + sys.exit(0) if not os.path.isdir(logdir): return From 5df03392793ff5873427f044261c2a162c1f3219 Mon Sep 17 00:00:00 2001 From: lededev Date: Mon, 4 Oct 2021 23:57:16 +0800 Subject: [PATCH 06/56] =?UTF-8?q?=E7=94=A8normpath()=E6=89=8D=E8=83=BD?= =?UTF-8?q?=E7=BB=B4=E6=8C=81=E5=8E=9F=E6=9D=A5=E7=9A=84=E5=A4=A7=E5=B0=8F?= =?UTF-8?q?=E5=86=99=EF=BC=8Cnormcase()=E4=BC=9A=E5=85=A8=E9=83=A8?= =?UTF-8?q?=E5=8F=98=E4=B8=BA=E5=B0=8F=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core.py b/core.py index 7794b9e..3ca9eb2 100755 --- a/core.py +++ b/core.py @@ -96,7 +96,7 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 print(f"[-]Fatal error! 
Can not make folder '{path}'") sys.exit(0) - return os.path.normcase(path) + return os.path.normpath(path) # =====================资源下载部分=========================== From 3183d284b78c8d281129813d5f0da3f7c9083276 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 08:33:03 +0800 Subject: [PATCH 07/56] number_parser.py:add more studio, unit test, full disk search as unit test --- number_parser.py | 173 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 54 deletions(-) diff --git a/number_parser.py b/number_parser.py index 2d1874e..616af85 100755 --- a/number_parser.py +++ b/number_parser.py @@ -1,14 +1,13 @@ import os import re -from core import * - +import sys G_spat = re.compile( "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@", re.IGNORECASE) -def get_number(debug,filepath: str) -> str: +def get_number(debug,file_path: str) -> str: # """ # >>> from number_parser import get_number # >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4") @@ -32,77 +31,143 @@ def get_number(debug,filepath: str) -> str: # >>> get_number("snis-829-C.mp4") # 'snis-829' # """ - filepath = os.path.basename(filepath) - - if debug == False: - try: - if '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 - #filepath = filepath.replace("_", "-") - filepath = G_spat.sub("", filepath) - filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 - lower_check = filename.lower() - if 'fc2' in lower_check: - filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper() - file_number = get_number_by_dict(lower_check) - if file_number: - return file_number - return str(re.search(r'\w+(-|_)\w+', filename, re.A).group()) - else: # 提取不含减号-的番号,FANZA CID - # 欧美番号匹配规则 - oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) - if oumei: - return oumei.group() - - try: - return str( - re.findall(r'(.+?)\.', - str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( - "['']").replace('_', '-') - except: - return re.search(r'(.+?)\.', filepath)[0] - except Exception as e: - print('[-]' + str(e)) - return - elif debug == True: - if '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 - #filepath = filepath.replace("_", "-") + filepath = os.path.basename(file_path) + # debug True 和 False 两块代码块合并,原因是此模块及函数只涉及字符串计算,没有IO操作,debug on时输出导致异常信息即可 + try: + file_number = get_number_by_dict(filepath) + if file_number: + return file_number + elif '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 filepath = G_spat.sub("", filepath) filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 lower_check = filename.lower() if 'fc2' in lower_check: filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper() - file_number = get_number_by_dict(lower_check) - if file_number: - return file_number return str(re.search(r'\w+(-|_)\w+', filename, re.A).group()) else: # 提取不含减号-的番号,FANZA CID # 欧美番号匹配规则 oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) if oumei: return oumei.group() - try: return str( re.findall(r'(.+?)\.', - str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( + str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( "['']").replace('_', '-') except: - return re.search(r'(.+?)\.', filepath)[0] + return str(re.search(r'(.+?)\.', filepath)[0]) + except Exception as e: + if debug: + print(f'[-]Number Parser exception: {e} [{file_path}]') + return None 
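+
+# Usage sketch (illustrative only; expected values mirror the test cases in
+# __main__ below -- actual results depend on G_TAKE_NUM_RULES defined next):
+#   get_number(True, "Tokyo Hot n9001 FHD.mp4")  ->  'n9001'      (studio rule)
+#   get_number(True, "caribean-020317_001.nfo")  ->  '020317-001' (carib rule, '_' fixed to '-')
+#   get_number(True, "snis-829-C.mp4")           ->  'snis-829'   (generic \w+(-|_)\w+ branch)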
+ +# 按javdb数据源的命名规范提取number G_TAKE_NUM_RULES = { - 'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()), - 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'), - '1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'), - '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'), - 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group()) - } + 'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()), + 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'), + '1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'), + '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), + 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), + 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), + 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) +} -def get_number_by_dict(lower_filename: str) -> str: - for k,v in G_TAKE_NUM_RULES.items(): - if k in lower_filename: - return v(lower_filename) +def get_number_by_dict(filename: str) -> str: + try: + for k,v in G_TAKE_NUM_RULES.items(): + if re.search(k, filename, re.I): + return v(filename) + except: + pass return None -# if __name__ == "__main__": +if __name__ == "__main__": # import doctest # doctest.testmod(raise_on_error=True) + test_use_cases = ( + "Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取 + "TokyoHot-n1287-HD SP2006 .mp4", + "caribean-020317_001.nfo", # -号误命名为_号的 + "257138_3xplanet_1Pondo_080521_001.mp4", + "ADV-R0624-CD3.wmv", # 多碟影片 + "XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源 + "xxx-av 20589.mp4", + "Muramura-102114_145-HD.wmv", # 新支持片商格式 102114_145 命名规则来自javdb数据源 + "heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源 + "HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源 + "pacopacomama-093021_539-FHD.mkv" # 新支持片商格式 093021_539 命名规则来自javdb数据源 + ) + def evprint(evstr): + code = compile(evstr, "", "eval") + print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code))) + for t in test_use_cases: + evprint(f'get_number(True, "{t}")') + + if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE): + sys.exit(0) + + # 使用Everything的ES命令行工具搜集全盘视频文件名作为用例测试number数据,参数为盘符 A .. 
Z 或带盘符路径 + # https://www.voidtools.com/support/everything/command_line_interface/ + # ES命令行工具需要Everything文件搜索引擎处于运行状态,es.exe单个执行文件需放入PATH路径中。 + # Everything是免费软件 + # 示例: + # python.exe .\number_parser.py ALL # 从所有磁盘搜索视频 + # python.exe .\number_parser.py D # 从D盘搜索 + # python.exe .\number_parser.py D: # 同上 + # python.exe .\number_parser.py D:\download\JAVs # 搜索D盘的\download\JAVs目录,路径必须带盘符 + # ================== + # Linux/WSL1|2 使用mlocate(Ubuntu/Debian)或plocate(Debian sid)搜集全盘视频文件名作为测试用例number数据 + # 需安装'sudo apt install mlocate或plocate'并首次运行sudo updatedb建立全盘索引 + # MAC OS X 使用findutils的glocate,需安装'sudo brew install findutils'并首次运行sudo gupdatedb建立全盘索引 + # 示例: + # python3 ./number_parser.py ALL + import subprocess + ES_search_path = "ALL disks" + if sys.argv[1] == "ALL": + if sys.platform == "win32": + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + elif sys.platform in ("linux", "darwin"): + ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate' + ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path) + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('utf-8') + out_list = [ os.path.basename(line) for line in out_text.splitlines()] + else: + print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.') + sys.exit(1) + else: # Windows single disk + if sys.platform != "win32": + print('[!]Usage: python3 ./number_parser.py ALL') + sys.exit(0) + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + if os.path.isdir(sys.argv[1]): + ES_search_path = sys.argv[1] + else: + ES_search_path = sys.argv[1][0] + ':/' + if not os.path.isdir(ES_search_path): + ES_search_path = 'C:/' + ES_search_path = os.path.normcase(ES_search_path) + ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...') + print(f'[+]Find {len(out_list)} Movies.') + for filename in out_list: + try: + n = get_number(True, filename) + if n: + print(f' [{n}] # {filename}') + else: + print(f'[-]Number return None. 
# {filename}') + except Exception as e: + print(f'[-]Number Parser exception: {e} [{filename}]') + + sys.exit(0) From 39ad0257603f734a812cea0a0ad2fe487b31b29b Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 10:22:05 +0800 Subject: [PATCH 08/56] config.py:override config settings by cmd params, pyinstaller add config.ini --- Makefile | 4 +- config.ini | 2 + config.py | 149 +++++++++++++++++++++++++++++++++++++-------- py_to_exe.ps1 | 5 +- wrapper/FreeBSD.sh | 6 +- wrapper/Linux.sh | 6 +- 6 files changed, 143 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 407aa4b..4c8960a 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,9 @@ make: #export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1) @echo "[+]Pyinstaller make" - pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img" + pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ @echo "[+]Move to bin" if [ ! -d "./bin" ];then mkdir bin; fi diff --git a/config.ini b/config.ini index 58e6892..f33a578 100755 --- a/config.ini +++ b/config.ini @@ -1,5 +1,6 @@ [common] main_mode=1 +source_folder=./ failed_output_folder=failed success_output_folder=JAV_output soft_link=0 @@ -16,6 +17,7 @@ nfo_skip_days=30 ; 处理完多少个视频文件后停止,0为处理所有视频文件 stop_counter=0 ; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 +ignore_failed_list=0 [proxy] ;proxytype: http or socks5 or socks5h switch: 0 1 diff --git a/config.py b/config.py index 82fd345..2b49ca0 100644 --- a/config.py +++ b/config.py @@ -1,33 +1,83 @@ import os +import re import sys import configparser -import codecs from pathlib import Path + +G_conf_override = { + # index 0 save Config() first instance for quick access by using getInstance() + 0 : None, + # register override config items + "common:main_mode" : None, + "common:source_folder" : None, + "common:auto_exit" : None, + "common:nfo_skip_days" : None, + "common:stop_counter" : None, + "common:ignore_failed_list" : None, + "debug_mode:switch" : None +} + + +def getInstance(): + if isinstance(G_conf_override[0], Config): + return G_conf_override[0] + return Config() + + class Config: def __init__(self, path: str = "config.ini"): - path_search_order = [ - path, - "./config.ini", - os.path.join(Path.home(), "avdc.ini"), - os.path.join(Path.home(), ".avdc.ini"), - os.path.join(Path.home(), ".avdc/config.ini"), - os.path.join(Path.home(), ".config/avdc/config.ini") - ] + path_search_order = ( + Path(path), + Path.cwd() / "config.ini", + Path.home() / "avdc.ini", + Path.home() / ".avdc.ini", + Path.home() / ".avdc/config.ini", + Path.home() / ".config/avdc/config.ini" + ) ini_path = None for p in path_search_order: - if os.path.isfile(p): - ini_path = p + if p.is_file(): + ini_path = p.resolve() break if ini_path: self.conf = configparser.ConfigParser() + self.ini_path = ini_path try: - self.conf.read(ini_path, encoding="utf-8-sig") + if self.conf.read(ini_path, encoding="utf-8-sig"): + if G_conf_override[0] is None: + G_conf_override[0] = self except: - self.conf.read(ini_path, encoding="utf-8") + if self.conf.read(ini_path, encoding="utf-8"): + if G_conf_override[0] is None: + G_conf_override[0] = self else: - print("[-]Config file not found!") - sys.exit(2) + print("ERROR: Config file not found!") + print("Please put config file into one of the following path:") + print('\n'.join([str(p.resolve()) for p in 
path_search_order[2:]])) + # 对于找不到配置文件的情况,还是在打包时附上对应版本的默认配置文件,有需要时为其在搜索路径中生成, + # 要比用户乱找一个版本不对应的配置文件会可靠些。这样一来,单个执行文件就是功能完整的了,放在任何 + # 执行路径下都可以放心使用。 + res_path = None + # pyinstaller打包的在打包中找config.ini + if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file(): + res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini' + # 脚本运行的所在位置找 + elif (Path(__file__).resolve().parent / 'config.ini').is_file(): + res_path = Path(__file__).resolve().parent / 'config.ini' + if res_path is None: + sys.exit(2) + ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:") + if re.search('n', ins, re.I): + sys.exit(2) + # 用户目录才确定具有写权限,因此选择 ~/avdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 + # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 + write_path = path_search_order[2] # Path.home() / "avdc.ini" + with open(write_path, 'w', encoding='utf-8') as wcfg: + wcfg.write(res_path.read_text(encoding='utf-8')) + print("Config file '{}' created.".format(write_path.resolve())) + input("Press Enter key exit...") + sys.exit(0) # self.conf = self._default_config() # try: # self.conf = configparser.ConfigParser() @@ -40,13 +90,24 @@ class Config: # print("[-]",e) # sys.exit(3) # #self.conf = self._default_config() + def getboolean_override(self, section, item) -> bool: + return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"]) - def main_mode(self) -> str: + def getint_override(self, section, item) -> int: + return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"]) + + def get_override(self, section, item) -> str: + return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"]) + + def main_mode(self) -> int: try: - return self.conf.getint("common", "main_mode") + return self.getint_override("common", "main_mode") except ValueError: self._exit("common:main_mode") + def source_folder(self) -> str: + return self.get_override("common", "source_folder") + def failed_folder(self) -> str: return self.conf.get("common", "failed_output_folder") @@ -61,7 +122,7 @@ class Config: def failed_move(self) -> bool: return self.conf.getboolean("common", "failed_move") def auto_exit(self) -> bool: - return self.conf.getboolean("common", "auto_exit") + return self.getboolean_override("common", "auto_exit") def transalte_to_sc(self) -> bool: return self.conf.getboolean("common", "transalte_to_sc") def multi_threading(self) -> bool: @@ -70,14 +131,16 @@ class Config: return self.conf.getboolean("common", "del_empty_folder") def nfo_skip_days(self) -> int: try: - return self.conf.getint("common", "nfo_skip_days") + return self.getint_override("common", "nfo_skip_days") except: return 30 def stop_counter(self) -> int: try: - return self.conf.getint("common", "stop_counter") + return self.getint_override("common", "stop_counter") except: return 0 + def ignore_failed_list(self) -> bool: + return self.getboolean_override("common", "ignore_failed_list") def is_transalte(self) -> bool: return self.conf.getboolean("transalte", "switch") def is_trailer(self) -> bool: @@ -173,7 +236,7 @@ class Config: return self.conf.get("escape", "folders") def debug(self) -> bool: - return self.conf.getboolean("debug_mode", "switch") + return self.getboolean_override("debug_mode", "switch") @staticmethod def _exit(sec: str) -> None: @@ -188,6 +251,7 @@ class Config: sec1 = "common" conf.add_section(sec1) 
conf.set(sec1, "main_mode", "1") + conf.set(sec1, "source_folder", "./") conf.set(sec1, "failed_output_folder", "failed") conf.set(sec1, "success_output_folder", "JAV_output") conf.set(sec1, "soft_link", "0") @@ -199,6 +263,7 @@ class Config: conf.set(sec1, "del_empty_folder", "1") conf.set(sec1, "nfo_skip_days", 30) conf.set(sec1, "stop_counter", 0) + conf.set(sec1, "ignore_failed_list", 0) sec2 = "proxy" conf.add_section(sec2) @@ -308,9 +373,45 @@ if __name__ == "__main__": code = compile(evstr, "", "eval") print('{}: "{}"'.format(evstr, eval(code))) config = Config() - mfilter = ('conf', 'proxy', '_exit', '_default_config') + mfilter = ('conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path') for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: evprint(f'config.{_m}()') pfilter = ('proxies', 'SUPPORT_PROXY_TYPE') - for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]: - evprint(f'config.proxy().{_p}') + # test getInstance() + assert(getInstance() == config) + for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: + evprint(f'getInstance().proxy().{_p}') + + # Override Test + G_conf_override["common:nfo_skip_days"] = 4321 + G_conf_override["common:stop_counter"] = 1234 + assert config.nfo_skip_days() == 4321 + assert getInstance().stop_counter() == 1234 + # remove override + G_conf_override["common:stop_counter"] = None + G_conf_override["common:nfo_skip_days"] = None + assert config.nfo_skip_days() != 4321 + assert config.stop_counter() != 1234 + # Create new instance + conf2 = Config() + assert getInstance() != conf2 + assert getInstance() == config + G_conf_override["common:main_mode"] = 9 + G_conf_override["common:source_folder"] = "A:/b/c" + # Override effect to all instances + assert config.main_mode() == 9 + assert conf2.main_mode() == 9 + assert getInstance().main_mode() == 9 + assert conf2.source_folder() == "A:/b/c" + print("### Override Test ###".center(36)) + evprint('getInstance().main_mode()') + evprint('config.source_folder()') + G_conf_override["common:main_mode"] = None + evprint('conf2.main_mode()') + evprint('config.main_mode()') + # unregister key acess will raise except + try: + print(G_conf_override["common:actor_gender"]) + except KeyError as ke: + print(f'Catched KeyError: {ke} is not a register key of G_conf_override dict.', file=sys.stderr) + print(f"Load Config file '{conf2.ini_path}'.") diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 index 7fc0f80..77f169a 100644 --- a/py_to_exe.ps1 +++ b/py_to_exe.ps1 @@ -3,14 +3,15 @@ $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1) -mkdir build +mkdir build mkdir __pycache__ pyinstaller --onefile AV_Data_Capture.py ` --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` - --add-data "Img;Img" + --add-data "Img;Img" ` + --add-data "config.ini;." 
` rmdir -Recurse -Force build rmdir -Recurse -Force __pycache__ diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh index 70f27d7..9717ef4 100755 --- a/wrapper/FreeBSD.sh +++ b/wrapper/FreeBSD.sh @@ -1,4 +1,8 @@ pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448 pip install pyquery pyinstaller -pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img" +pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + cp config.ini ./dist diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh index 1d05e6a..63e3b1c 100755 --- a/wrapper/Linux.sh +++ b/wrapper/Linux.sh @@ -12,5 +12,9 @@ #fi pip3 install -r requirements.txt pip3 install cloudscraper==1.2.52 -pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img" +pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + cp config.ini ./dist From 8cb57673b05afb8522ecd090d25556f96d057246 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:15:30 +0800 Subject: [PATCH 09/56] log auto merge --- AV_Data_Capture.py | 96 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 18 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 257d9b0..19a3212 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -99,18 +99,18 @@ class ErrLogger(OutLogger): def dupe_stdout_to_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0: return - if not os.path.exists(logdir): + log_dir = Path(logdir) + if not log_dir.exists(): try: - os.makedirs(logdir) + log_dir.mkdir(parents=True,exist_ok=True) except: - print(f"[-]Fatal error! 
Can not make log folder '{logdir}'") - sys.exit(0) - if not os.path.isdir(logdir): - return - + pass + if not log_dir.is_dir(): + return # Tips for disabling logs by change directory to a same name empty regular file + abslog_dir = log_dir.resolve() log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") - logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt') - errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt') + logfile = abslog_dir / f'avdc_{log_tmstr}.txt' + errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt' sys.stdout = OutLogger(logfile) sys.stderr = ErrLogger(errlog) @@ -119,25 +119,85 @@ def dupe_stdout_to_logfile(logdir: str): def close_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): return - #日志关闭前保存日志文件路径 - filepath = '' + #日志关闭前保存日志路径 + filepath = None try: filepath = sys.stdout.filepath except: pass sys.stdout.close() sys.stderr.close() - if len(filepath): - print("Log file '{}' saved.".format(filepath)) + log_dir = Path(logdir).resolve() + if isinstance(filepath, Path): + print(f"Log file '{filepath}' saved.") + assert(filepath.parent.samefile(log_dir)) # 清理空文件 - for current_dir, subdirs, files in os.walk(logdir, topdown=False): + for f in log_dir.glob(r'*_err.txt'): + if f.stat().st_size == 0: + try: + f.unlink(missing_ok=True) + except: + pass + # 合并日志 只检测日志目录内的文本日志,忽略子目录。三个月前的日志,按月合并为一个月志, + # 去年及以前的月志,今年4月以后将之按年合并为年志 + # 测试步骤: + """ + LOGDIR=/tmp/avlog + mkdir -p $LOGDIR + for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done + for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done + echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" + # 1932 files in /tmp/avlog + avdc -zgic1 -d0 -m3 -o $LOGDIR + # python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR + ls $LOGDIR + # rm -rf $LOGDIR + """ + # 第一步,合并到月 + for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] + if not txts or not len(txts): + break + txts.sort() + today = datetime.today() + tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") + deadline_month = f'avdc_{tmstr_3_month_ago}' + month_merge = [f for f in txts if f.stem < deadline_month] + if not month_merge or not len(month_merge): + break + tomonth = len('01T235959.txt') # cut length avdc_202012|01T235959.txt + for f in month_merge: + try: + month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt + with open(month_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第二步,月合并到年 + if today.month < 4: + return + mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{6}', f.stem, re.A)] + if not mons or not len(mons): + return + mons.sort() + deadline_year = f'avdc_{today.year-1}13' + year_merge = [f for f in mons if f.stem < deadline_year] + if not year_merge or not len(year_merge): + return + toyear = len('12.txt') # cut length avdc_2020|12.txt + for f in year_merge: try: - for f in files: - full_name = os.path.join(current_dir, f) - if os.path.getsize(full_name) == 0: - os.remove(full_name) + year_file_name = str(f)[:-toyear] + '.txt' # avdc_2020.txt + with open(year_file_name, 'a', encoding='utf-8') as y: + y.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) except: pass + # 第三步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 + # 这种粒度的文本日志,压缩比是目前最好的。lzip -9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, + # 
多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, + # 100MB的日志文件能缩小到3.7MB。 # 重写视频文件扫描,消除递归,取消全局变量,新增失败文件列表跳过处理 From cf072e79d1c352e1c271bf98923a4d7905f60ca0 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:29:47 +0800 Subject: [PATCH 10/56] =?UTF-8?q?=E8=BE=93=E5=87=BA=E6=8E=92=E7=89=88?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=8Cnumber=E6=94=BE=E5=9C=A8=E5=B7=A6?= =?UTF-8?q?=E8=BE=B9=E5=9B=BA=E5=AE=9A=E4=BD=8D=E7=BD=AE=EF=BC=8C=E4=B8=8A?= =?UTF-8?q?=E4=B8=80=E8=A1=8C=E7=9A=84=E7=95=99=E7=99=BD=E4=BB=A5=E4=BE=BF?= =?UTF-8?q?=E8=BF=85=E9=80=9F=E5=AE=9A=E4=BD=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 19a3212..6ab00ad 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -306,7 +306,7 @@ def create_data_and_move(file_path: str, c: config.Config, debug): file_path = os.path.abspath(file_path) if debug == True: - print(f"[!]Making Data for [{file_path}], the number is [{n_number}]") + print(f"[!] [{n_number}] As Number making data for '{file_path}'") if n_number: core_main(file_path, n_number, c) else: @@ -314,7 +314,7 @@ def create_data_and_move(file_path: str, c: config.Config, debug): print("[*]======================================================") else: try: - print(f"[!]Making Data for [{file_path}], the number is [{n_number}]") + print(f"[!] [{n_number}] As Number making data for '{file_path}'") if n_number: core_main(file_path, n_number, c) else: @@ -333,8 +333,11 @@ def create_data_and_move(file_path: str, c: config.Config, debug): def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number): file_name = os.path.basename(file_path) try: - print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number)) - core_main(file_path, custom_number, c) + print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number)) + if custom_number: + core_main(file_path, custom_number, c) + else: + print("[-] number empty ERROR") print("[*]======================================================") except Exception as err: print("[-] [{}] ERROR:".format(file_path)) @@ -372,6 +375,7 @@ if __name__ == '__main__': if conf.update_check(): check_update(version) + print(f"[+]Load Config file '{conf.ini_path}'.") if conf.debug(): print('[+]Enable debug') if conf.soft_link(): @@ -408,7 +412,7 @@ if __name__ == '__main__': for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' - print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -') + print('[!] 
{:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) create_data_and_move(movie_path, conf, conf.debug()) if count >= stop_count: print("[!]Stop counter triggered!") From a405c5c41bf21f20ea8c8f13b5fac3a40d718c41 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:46:35 +0800 Subject: [PATCH 11/56] =?UTF-8?q?WebCrawler:=E5=85=A8=E9=9D=A2=E6=8D=A2?= =?UTF-8?q?=E8=A3=85getInstance()=EF=BC=8C=E5=8E=98=E6=B8=85airav.py?= =?UTF-8?q?=E4=B8=8Ejavbus.py=E5=8F=8Ajavdb.py=E7=9A=84=E7=9B=B8=E7=88=B1?= =?UTF-8?q?=E7=9B=B8=E6=9D=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 10 +++ WebCrawler/airav.py | 194 ++++++++++++++++++++--------------------- WebCrawler/avsox.py | 8 +- WebCrawler/carib.py | 61 ++++++------- WebCrawler/dlsite.py | 2 +- WebCrawler/fc2.py | 6 +- WebCrawler/fc2club.py | 4 +- WebCrawler/javbus.py | 23 +++-- WebCrawler/javdb.py | 61 ++++++++----- WebCrawler/mgstage.py | 2 +- WebCrawler/xcity.py | 2 +- 11 files changed, 206 insertions(+), 167 deletions(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index e1608b6..dc54b46 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -134,6 +134,14 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 print('[-]Movie Number not found!') return None + # 增加number严格判断,避免提交任何number,总是返回"本橋実来 ADZ335",这种返回number不一致的数据源故障 + # 目前选用number命名规则是javdb.com Domain Creation Date: 2013-06-19T18:34:27Z + # 然而也可以跟进关注其它命名规则例如airav.wiki Domain Creation Date: 2019-08-28T07:18:42.0Z + # 如果将来javdb.com命名规则下不同Studio出现同名碰撞导致无法区分,可考虑更换规则,更新相应的number分析和抓取代码。 + if str(json_data.get('number')).upper() != file_number.upper(): + print('[-]Movie number has changed! 
[{}]->[{}]'.format(file_number, str(json_data.get('number')))) + return None + # ================================================网站规则添加结束================================================ title = json_data.get('title') @@ -225,6 +233,8 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 studio = studio.replace('エムズビデオグループ','M’s Video Group') studio = studio.replace('ミニマム','Minimum') studio = studio.replace('ワープエンタテインメント','WAAP Entertainment') + studio = studio.replace('pacopacomama,パコパコママ','pacopacomama') + studio = studio.replace('パコパコママ','pacopacomama') studio = re.sub('.*/妄想族','妄想族',studio) studio = studio.replace('/',' ') # === 替换Studio片假名 END diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py index 5925421..f7b144c 100644 --- a/WebCrawler/airav.py +++ b/WebCrawler/airav.py @@ -6,6 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * +from WebCrawler import javbus ''' API @@ -17,95 +18,94 @@ API host = 'https://www.airav.wiki' # airav这个网站没有演员图片,所以直接使用javbus的图 -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) - d={} - for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=urljoin("https://www.javbus.com", - str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) - p2={t:p} - d.update(p2) - return d +def getActorPhoto(javbus_json): + result = javbus_json.get('actor_photo') + if isinstance(result, dict) and len(result): + return result + return '' def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - # h5:first-child定位第一个h5标签,妈的找了好久才找到这个语法 - title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-') - try: - title2 = re.sub('n\d+-','',title) + html = etree.fromstring(htmlcode, etree.HTMLParser()) + title = str(html.xpath('/html/head/title/text()')[0]) + result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip() + return result - return title2 +def getStudio(htmlcode, javbus_json): #获取厂商 已修改 + # javbus如果有数据以它为准 + result = javbus_json.get('studio') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode,etree.HTMLParser()) + return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']") +def getYear(htmlcode, javbus_json): #获取年份 + result = javbus_json.get('year') + if isinstance(result, str) and len(result): + return result + release = getRelease(htmlcode, javbus_json) + if len(release) != len('2000-01-01'): + return '' + return release[:4] +def getCover(htmlcode, javbus_json): #获取封面图片 + result = javbus_json.get('cover') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode, etree.HTMLParser()) + return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0] +def getRelease(htmlcode, javbus_json): #获取出版日期 + result = javbus_json.get('release') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group() except: - return title - -def getStudio(htmlcode): #获取厂商 已修改 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - # 如果记录中冇导演,厂商排在第4位 - if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = 
str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - # 如果记录中有导演,厂商排在第5位 - elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - else: - result = '' + return '' return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) - image = doc('a.bigImage') - return urljoin("https://www.javbus.com", image.attr('href')) -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getRuntime(htmlcode): #获取分钟 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") - return result -def getActor(htmlcode): #获取女优 +def getRuntime(javbus_json): #获取播放时长 + result = javbus_json.get('runtime') + if isinstance(result, str) and len(result): + return result + return '' +# airav女优数据库较多日文汉字姓名,javbus较多日语假名,因此airav优先 +def getActor(htmlcode, javbus_json): #获取女优 b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) - return b -def getNum(htmlcode): #获取番号 html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 已修改 + a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()') + for v in a: + v = v.strip() + if len(v): + b.append(v) + if len(b): + return b + result = javbus_json.get('actor') + if isinstance(result, list) and len(result): + return result + return [] +def getNum(htmlcode, javbus_json): #获取番号 + result = javbus_json.get('number') + if isinstance(result, str) and len(result): + return result html = etree.fromstring(htmlcode, etree.HTMLParser()) - if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - else: - result = '' # 记录中有可能没有导演数据 + title = str(html.xpath('/html/head/title/text()')[0]) + result = str(re.findall('^\[(.*?)]', title)[0]) return result - -def getOutline(htmlcode): #获取演员 +def getDirector(javbus_json): #获取导演 已修改 + result = javbus_json.get('director') + if isinstance(result, str) and len(result): + return result + return '' +def getOutline(htmlcode): #获取概述 html = etree.fromstring(htmlcode, etree.HTMLParser()) try: - result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','') + result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip() return result except: return '' -def getSerise(htmlcode): #获取系列 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - # 如果记录中冇导演,系列排在第6位 - if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") - # 如果记录中有导演,系列排在第7位 - elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): - result = 
str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - else: - result = '' - return result +def getSerise(javbus_json): #获取系列 已修改 + result = javbus_json.get('series') + if isinstance(result, str) and len(result): + return result + return '' def getTag(htmlcode): # 获取标签 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') @@ -169,52 +169,50 @@ def main(number): try: try: htmlcode = get_html('https://cn.airav.wiki/video/' + number) - javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number) - + javbus_json = json.loads(javbus.main(number)) except: print(number) dic = { # 标题可使用airav - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), - # 制作商选择使用javbus - 'studio': getStudio(javbus_htmlcode), - # 年份也是用javbus - 'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()), + 'title': getTitle(htmlcode), + # 制作商先找javbus,如果没有再找本站 + 'studio': getStudio(htmlcode, javbus_json), + # 年份先试javbus,如果没有再找本站 + 'year': getYear(htmlcode, javbus_json), # 简介 使用 airav 'outline': getOutline(htmlcode), # 使用javbus - 'runtime': getRuntime(javbus_htmlcode), + 'runtime': getRuntime(javbus_json), # 导演 使用javbus - 'director': getDirector(javbus_htmlcode), - # 作者 使用airav - 'actor': getActor(javbus_htmlcode), - # 发售日使用javbus - 'release': getRelease(javbus_htmlcode), + 'director': getDirector(javbus_json), + # 演员 先试airav + 'actor': getActor(htmlcode, javbus_json), + # 发售日先试javbus + 'release': getRelease(htmlcode, javbus_json), # 番号使用javbus - 'number': getNum(javbus_htmlcode), + 'number': getNum(htmlcode, javbus_json), # 封面链接 使用javbus - 'cover': getCover(javbus_htmlcode), + 'cover': getCover(htmlcode, javbus_json), # 剧照获取 'extrafanart': getExtrafanart(htmlcode), 'imagecut': 1, # 使用 airav 'tag': getTag(htmlcode), # 使用javbus - 'label': getSerise(javbus_htmlcode), + 'label': getSerise(javbus_json), # 妈的,airav不提供作者图片 - 'actor_photo': getActorPhoto(javbus_htmlcode), - +# 'actor_photo': getActorPhoto(javbus_json), 'website': 'https://www.airav.wiki/video/' + number, 'source': 'airav.py', # 使用javbus - 'series': getSerise(javbus_htmlcode), + 'series': getSerise(javbus_json) } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') return js except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", @@ -226,6 +224,6 @@ def main(number): if __name__ == '__main__': - #print(main('ADN-188')) - print(main('ADN-188')) - print(main('CJOD-278')) + print(main('ADV-R0624')) # javbus页面返回404, airav有数据 + print(main('ADN-188')) # 一人 + print(main('CJOD-278')) # 多人 javbus演员名称采用日语假名,airav采用日文汉字 diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 254f3e8..293769a 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -100,6 +100,9 @@ def main(number): soup = BeautifulSoup(web, 'lxml') info = str(soup.find(attrs={'class': 'row movie'})) try: + new_number = getNum(info) + if new_number.upper() != number.upper(): + raise ValueError('number not found') dic = { 'actor': getActor(web), 'title': getTitle(web).strip(getNum(web)), @@ -108,7 +111,7 @@ def main(number): 'runtime': getRuntime(info), 'director': '', # 'release': getRelease(info), - 'number': getNum(info), + 'number': new_number, 'cover': getCover(web), 'cover_small': getCover_small(a), 'imagecut': 3, @@ -121,7 +124,7 @@ def main(number): 'series': getSeries(info), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, 
sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -129,3 +132,4 @@ def main(number): if __name__ == "__main__": print(main('012717_472')) + print(main('1')) # got fake result raise 'number not found' diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index 8eee1af..c1a25d9 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -1,51 +1,53 @@ import sys sys.path.append('../') import json -from bs4 import BeautifulSoup from lxml import html import re from ADC_function import * def main(number: str) -> json: try: - caribbytes, browser = get_html_by_browser( + carib_obj, browser = get_html_by_browser( 'https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type="browser") - if not caribbytes or not caribbytes.ok: + if not carib_obj or not carib_obj.ok: raise ValueError("page not found") lx = html.fromstring(str(browser.page)) if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): raise ValueError("page info not found") + + dic = { + 'title': get_title(lx), + 'studio': '加勒比', + 'year': get_year(lx), + 'outline': get_outline(lx), + 'runtime': get_runtime(lx), + 'director': '', + 'actor': get_actor(lx), + 'release': get_release(lx), + 'number': number, + 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', + 'tag': get_tag(lx), + 'extrafanart': get_extrafanart(lx), + 'label': get_series(lx), + 'imagecut': 1, +# 'actor_photo': get_actor_photo(browser), + 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', + 'source': 'carib.py', + 'series': get_series(lx), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) + return js + except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - dic = { - 'title': get_title(lx), - 'studio': '加勒比', - 'year': get_year(lx), - 'outline': get_outline(lx), - 'runtime': get_runtime(lx), - 'director': '', - 'actor': get_actor(lx), - 'release': get_release(lx), - 'number': number, - 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', - 'tag': get_tag(lx), - 'extrafanart': get_extrafanart(lx), - 'label': get_series(lx), - 'imagecut': 1, -# 'actor_photo': get_actor_photo(browser), - 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', - 'source': 'carib.py', - 'series': get_series(lx), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) - return js + def get_title(lx: html.HtmlElement) -> str: return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip() @@ -114,11 +116,10 @@ def get_actor_photo(browser): if pos<0: continue css = html[pos:pos+100] - p0 = css.find('background: url(') - p1 = css.find('.jpg)') - if p0<0 or p1<0: + cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) + if not cssBGjpgs or not len(cssBGjpgs[0]): continue - p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])} + p = {k: urljoin(browser.url, cssBGjpgs[0])} o.update(p) return o diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py index 066e04f..d22cdb1 100644 --- a/WebCrawler/dlsite.py +++ b/WebCrawler/dlsite.py @@ -153,7 +153,7 @@ def main(number): js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') 
return js except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index e6ae516..0a51fdc 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -93,10 +93,11 @@ def main(number): actor = '素人' lx = etree.fromstring(htmlcode2, etree.HTMLParser()) cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']") + cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover) dic = { 'title': lx.xpath('/html/head/title/text()')[0], 'studio': getStudio_fc2com(htmlcode2), - 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), + 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 'outline': '', # getOutline_fc2com(htmlcode2), 'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]), 'director': getStudio_fc2com(htmlcode2), @@ -116,7 +117,7 @@ def main(number): 'series': '', } except Exception as e: - if ADC_function.config.Config().debug(): + if ADC_function.config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -124,4 +125,5 @@ def main(number): if __name__ == '__main__': print(main('FC2-1787685')) + print(main('FC2-2086710')) diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py index 7d0fac6..df14b3b 100644 --- a/WebCrawler/fc2club.py +++ b/WebCrawler/fc2club.py @@ -84,7 +84,7 @@ def main(number): dic = { 'title': getTitle_fc2com(htmlcode2), 'studio': getStudio_fc2com(htmlcode2), - 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), + 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 'outline': '', # getOutline_fc2com(htmlcode2), 'runtime': '', 'director': getStudio_fc2com(htmlcode2), @@ -103,7 +103,7 @@ def main(number): 'series': '', } except Exception as e: - if ADC_function.config.Config().debug(): + if ADC_function.config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 7446ef3..1af4359 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -6,8 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * -from WebCrawler import fanza -from WebCrawler import airav +import inspect def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img soup = BeautifulSoup(htmlcode, 'lxml') @@ -82,12 +81,16 @@ def getCID(htmlcode): result = re.sub('/.*?.jpg','',string) return result def getOutline(number): #获取剧情介绍 + if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): + return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 try: - response = json.loads(airav.main(number)) - result = response['outline'] + htmlcode = get_html('https://cn.airav.wiki/video/' + number) + from WebCrawler.airav import getOutline as airav_getOutline + result = airav_getOutline(htmlcode) return result except: - return '' + pass + return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -117,13 +120,15 @@ def getExtrafanart(htmlcode): # 获取剧照 extrafanart_pather = re.compile(r'404 Page Not Found" in htmlcode: + raise Exception('404 page not found') dic = { 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), 'studio': getStudio(htmlcode), @@ 
-155,6 +160,8 @@ def main(number): htmlcode = get_html('https://www.fanbus.us/' + number) except: htmlcode = get_html('https://www.javbus.com/' + number) + if "404 Page Not Found" in htmlcode: + raise Exception('404 page not found') dic = { 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), 'studio': getStudio(htmlcode), @@ -180,7 +187,7 @@ def main(number): except: return main_uncensored(number) except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", @@ -191,5 +198,7 @@ def main(number): return js if __name__ == "__main__" : + print(main('ADV-R0624')) # 404 print(main('ipx-292')) print(main('CEMD-011')) + print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index ecc4f36..756be1c 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -5,7 +5,7 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -from WebCrawler import airav +import secrets # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) @@ -21,7 +21,7 @@ def getActor(a): genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') r = [] idx = 0 - actor_gendor = config.Config().actor_gender() + actor_gendor = config.getInstance().actor_gender() if not actor_gendor in ['female','male','both','all']: actor_gendor = 'female' for act in actors: @@ -67,9 +67,15 @@ def getStudio(a): patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>') pianshang = patherr.findall(a) if pianshang: - result = pianshang[0] - else: - result = "" + result = pianshang[0].strip() + if len(result): + return result + # 以卖家作为工作室 + html = etree.fromstring(a, etree.HTMLParser()) + try: + result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") + except: + result = '' return result def getRuntime(a): @@ -171,16 +177,13 @@ def getTrailer(htmlcode): # 获取预告片 return video_url def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - return extrafanart_imgs - return '' - + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = [] + try: + result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") + except: + pass + return result def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) try: @@ -195,11 +198,13 @@ def getDirector(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getOutline(number): #获取剧情介绍 try: - response = json.loads(airav.main(number)) - result = response['outline'] + htmlcode = get_html('https://cn.airav.wiki/video/' + number) + from WebCrawler.airav import getOutline as airav_getOutline + result = airav_getOutline(htmlcode) return result except: - return '' + pass + return '' def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -208,7 +213,7 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = 
random.choice(["javdb9", "javdb30"]) + javdb_site = secrets.choice(["javdb9", "javdb30"]) try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass @@ -303,8 +308,16 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b 'series': getSeries(detail_page), } + if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): + dic['actor'].append('素人') + if not dic['series']: + dic['series'] = dic['studio'] + if not dic['label']: + dic['label'] = dic['studio'] + + except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -316,7 +329,9 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) - print(main('FC2-735670')) - print(main('FC2-1174949')) # not found + print(main('093021_539')) # 没有剧照 片商pacopacomama + # print(main('FC2-2278260')) + # print(main('FC2-735670')) + # print(main('FC2-1174949')) # not found print(main('MVSD-439')) - print(main('EHM0001')) # not found + # print(main('EHM0001')) # not found diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py index 59f4572..8f58cb6 100644 --- a/WebCrawler/mgstage.py +++ b/WebCrawler/mgstage.py @@ -137,7 +137,7 @@ def main(number2): 'series': getSeries(a), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index a7b4cff..858dd54 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -224,7 +224,7 @@ def main(number): 'series': getSeries(detail_page), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} From 40d25d23f5e87a21189b97627c11e92c8d877484 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 12:17:12 +0800 Subject: [PATCH 12/56] =?UTF-8?q?ADC=5Ffunction.py:=E6=8D=A2=E8=A3=85getIn?= =?UTF-8?q?stance(),load=5Fcookies()=E6=94=B9=E7=94=A8pathlib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index b13d0b4..a11ef3b 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -2,7 +2,7 @@ from os import replace import requests import hashlib from pathlib import Path -import random +#import secrets import os.path import uuid import json @@ -24,8 +24,8 @@ G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (K # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): - verify = config.Config().cacert_file() - configProxy = config.Config().proxy() + verify = config.getInstance().cacert_file() + configProxy = config.getInstance().proxy() errors = "" if ua is None: @@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None) def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() errors = "" headers_ua = {"User-Agent": G_USER_AGENT} if headers is None: @@ -86,7 +86,7 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str 
= None): browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() result = browser.open(url) @@ -107,7 +107,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) if isinstance(cookies, dict): requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() result = browser.open(url) @@ -131,7 +131,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d # def get_javlib_cookie() -> [dict, str]: # import cloudscraper -# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy() +# switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy() # proxies = get_proxy(proxy, proxytype) # # raw_cookie = {} @@ -158,7 +158,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d def translateTag_to_sc(tag): - tranlate_to_sc = config.Config().transalte_to_sc() + tranlate_to_sc = config.getInstance().transalte_to_sc() if tranlate_to_sc: dict_gen = {'中文字幕': '中文字幕', '高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头', @@ -506,7 +506,7 @@ def translate( ): trans_result = "" if engine == "google-free": - gsite = config.Config().get_translate_service_site() + gsite = config.getInstance().get_translate_service_site() if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite): gsite = 'translate.google.cn' url = ( @@ -521,7 +521,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t trans_result = trans_result.join(translate_list) # elif engine == "baidu": # url = "https://fanyi-api.baidu.com/api/trans/vip/translate" - # salt = random.randint(1, 1435660288) + # salt = secrets.randbelow(1435660287) + 1 # random.randint(1, 1435660288) # sign = app_id + src + str(salt) + key # sign = hashlib.md5(sign.encode()).hexdigest() # url += ( @@ -564,7 +564,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t def is_uncensored(number): if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper(): return True - configs = config.Config().get_uncensored() + configs = config.getInstance().get_uncensored() prefix_list = str(configs).split(',') for pre in prefix_list: if pre.upper() in number.upper(): @@ -593,20 +593,20 @@ def load_cookies(filename): filename = os.path.basename(filename) if not len(filename): return None, None - path_search_order = [ - f"./{filename}", - os.path.join(Path.home(), filename), - os.path.join(Path.home(), f".avdc/{filename}"), - os.path.join(Path.home(), f".local/share/avdc/{filename}") -] + path_search_order = ( + Path.cwd() / filename, + Path.home() / filename, + Path.home() / f".avdc/{filename}", + Path.home() / f".local/share/avdc/{filename}" + ) cookies_filename = None - for p in path_search_order: - if os.path.exists(p): - cookies_filename = os.path.abspath(p) - break - if not cookies_filename: - return None, None try: + for p in path_search_order: + if p.is_file(): + cookies_filename = str(p.resolve()) + break + if not cookies_filename: + return None, None return json.load(open(cookies_filename)), cookies_filename 
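+        # Note: the bare except below also covers the search loop above, so an
+        # unreadable candidate path or a corrupt cookies file degrades to the
+        # (None, None) "no cookies" result instead of raising into the caller;
+        # p.is_file() (unlike the old os.path.exists) also rejects directories.
+        # Equivalent lookup in one expression (a sketch):
+        #   cookies_filename = next((str(p.resolve()) for p in path_search_order if p.is_file()), None)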
except: return None, None From b87206870be7ee9a56726e864c4ca5b3092eeff5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 12:29:46 +0800 Subject: [PATCH 13/56] core.py:enhancement --- core.py | 77 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/core.py b/core.py index 3ca9eb2..264d30b 100755 --- a/core.py +++ b/core.py @@ -3,8 +3,6 @@ import os.path import pathlib import re import shutil -import platform -import errno import sys from PIL import Image @@ -33,7 +31,6 @@ def moveFailedFolder(filepath, conf): print("[-]Add to Failed List file, see '%s'" % ftxt) with open(ftxt, 'a', encoding='utf-8') as flt: flt.write(f'{filepath}\n') - flt.close() elif conf.failed_move() and not soft_link: failed_name = os.path.join(failed_folder, os.path.basename(filepath)) mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt')) @@ -41,8 +38,13 @@ def moveFailedFolder(filepath, conf): with open(mtxt, 'a', encoding='utf-8') as wwibbmt: tmstr = datetime.now().strftime("%Y-%m-%d %H:%M") wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n') - wwibbmt.close() - shutil.move(filepath, failed_name) + try: + if os.path.exists(failed_name): + print('[-]File Exists while moving to FailedFolder') + return + shutil.move(filepath, failed_name) + except: + print('[-]File Moving to FailedFolder unsuccessful!') def get_info(json_data): # 返回json里的数据 @@ -224,7 +226,6 @@ def image_download(cover, number, leak_word, c_word, path, conf: config.Config, def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf): title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) - failed_folder = conf.failed_folder() if conf.main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 nfo_path = str(Path(filepath).with_suffix('.nfo')) else: @@ -236,6 +237,10 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except: print(f"[-]Fatal error! 
can not make folder '{path}'") sys.exit(0) + + # KODI内查看影片信息时找不到number,配置naming_rule=number+'#'+title虽可解决 + # 但使得标题太长,放入时常为空的outline内会更适合,软件给outline留出的显示版面也较大 + outline = f"{number}#{outline}" with open(nfo_path, "wt", encoding='UTF-8') as code: print('<?xml version="1.0" encoding="UTF-8" ?>', file=code) print("<movie>", file=code) @@ -287,7 +292,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" <num>" + number + "</num>", file=code) print(" <premiered>" + release + "</premiered>", file=code) print(" <cover>" + cover + "</cover>", file=code) - if config.Config().is_trailer(): + if conf.is_trailer(): print(" <trailer>" + trailer + "</trailer>", file=code) print(" <website>" + website + "</website>", file=code) print("</movie>", file=code) @@ -405,22 +410,30 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config file_parent_origin_path = str(filepath_obj.parent) try: targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}") + # 任何情况下都不要覆盖,以免遭遇数据源或者引擎错误导致所有文件得到同一个number,逐一 + # 同名覆盖致使全部文件损失且不可追回的最坏情况 + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') # 如果soft_link=1 使用软链接 if conf.soft_link() == 0: shutil.move(filepath, targetpath) elif conf.soft_link() == 1: - # 采用相对路径,以便网络访问时能正确打开视频 - filerelpath = os.path.relpath(filepath, path) - os.symlink(filerelpath, targetpath) + # 先尝试采用相对路径,以便网络访问时能正确打开视频,失败则可能是因为跨盘符等原因无法支持 + # 相对路径径,改用绝对路径方式尝试建立软链接 + try: + filerelpath = os.path.relpath(filepath, path) + os.symlink(filerelpath, targetpath) + except: + os.symlink(filepath_obj.resolve(), targetpath) elif conf.soft_link() == 2: shutil.move(filepath, targetpath) # 移走文件后,在原来位置增加一个可追溯的软链接,指向文件新位置 # 以便追查文件从原先位置被移动到哪里了,避免因为得到错误番号后改名移动导致的文件失踪 - # 便于手工找回文件。并将软连接文件名后缀修改,以避免再次被搜刮。 + # 便于手工找回文件。由于目前软链接已经不会被刮削,文件名后缀无需再修改。 targetabspath = os.path.abspath(targetpath) if targetabspath != os.path.abspath(filepath): targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path) - os.symlink(targetrelpath, filepath + '#sym') + os.symlink(targetrelpath, filepath) sub_res = conf.sub_rule() for subname in sub_res: @@ -430,9 +443,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config print('[+]Sub moved!') return True - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') + moveFailedFolder(filepath, conf) return except PermissionError: print('[-]Error! 
Please run as administrator!') @@ -448,11 +461,14 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo filepath_obj = pathlib.Path(filepath) houzhui = filepath_obj.suffix file_parent_origin_path = str(filepath_obj.parent) + targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}") + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') try: if conf.soft_link(): - os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")) + os.symlink(filepath, targetpath) else: - shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")) + shutil.move(filepath, targetpath) sub_res = conf.sub_rule() for subname in sub_res: @@ -462,9 +478,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo print('[+]Sub moved!') print('[!]Success') return True - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') return except PermissionError: print('[-]Error! Please run as administrator!') @@ -594,17 +609,18 @@ def core_main(file_path, number_th, conf: config.Config): # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) - # 打印文件 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) - - # 移动文件 - paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) - + # 添加水印 poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + # 移动电影 + paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) + elif conf.main_mode() == 2: # 创建文件夹 path = create_folder(json_data, conf) @@ -639,11 +655,12 @@ def core_main(file_path, number_th, conf: config.Config): # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) - # 打印文件 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, - tag, json_data.get('actor_list'), liuchu, uncensored, conf) - + # 添加水印 poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, + tag, json_data.get('actor_list'), liuchu, uncensored, conf) From 8ab736e4fabfee5f4022170b54743543e63db8ca Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 13:02:52 +0800 Subject: [PATCH 14/56] AV_Data_Capture.py:command params new add -m -d -c -i -g -z --- ADC_function.py | 8 -- AV_Data_Capture.py | 205 +++++++++++++++++++++++++++++---------------- 2 files changed, 135 insertions(+), 78 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index a11ef3b..e755fb5 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -622,11 +622,3 @@ def 
file_modification_days(filename) -> int: if days < 0: return 9999 return days - -# 检查文件是否是链接 -def is_link(filename: str): - if os.path.islink(filename): - return True # symlink - elif os.stat(filename).st_nlink > 1: - return True # hard link Linux MAC OSX Windows NTFS - return False diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 6ab00ad..9b75f50 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -11,7 +11,7 @@ import config from datetime import datetime, timedelta import time from pathlib import Path -from ADC_function import file_modification_days, get_html, is_link +from ADC_function import file_modification_days, get_html from number_parser import get_number from core import core_main, moveFailedFolder @@ -35,25 +35,48 @@ def check_update(local_version): def argparse_function(ver: str) -> typing.Tuple[str, str, bool]: - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + conf = config.getInstance() + parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.") parser.add_argument("file", default='', nargs='?', help="Single Movie file path.") parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.") - # parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.") - default_logdir = os.path.join(Path.home(),'.avlogs') + parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") + parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") + # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") + default_logdir = Path.home() / '.avlogs' parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?', - help=f"""Duplicate stdout and stderr to logfiles -in logging folder, default on. -default for current user: {default_logdir} -Use --log-dir= to turn off logging feature.""") - parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number") - parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true", - help="Auto exit after program complete") + help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. + default folder for current user: '{default_logdir}'. Change default folder to an empty file, + or use --log-dir= to turn log off.""") parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.") + parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.") + parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.") + parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format( + os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt'))) + parser.add_argument("-a", "--auto-exit", action="store_true", + help="Auto exit after program complete") + parser.add_argument("-g","--debug", action="store_true", + help="Turn on debug mode to generate diagnostic log for issue report.") + parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true", + help="""Only show job list of files and numbers, and **NO** actual operation +is performed. 
It may help you correct wrong numbers before real job.""") parser.add_argument("-v", "--version", action="version", version=ver) + #ini_path args = parser.parse_args() + def get_natural_number_or_none(value): + return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None + def get_str_or_none(value): + return value if isinstance(value, str) and len(value) else None + def get_bool_or_none(value): + return True if isinstance(value, bool) and value else None + config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode) + config.G_conf_override["common:source_folder"] = get_str_or_none(args.path) + config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit) + config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days) + config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt) + config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list) + config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug) - return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr - + return args.file, args.number, args.logdir, args.regexstr, args.zero_op class OutLogger(object): def __init__(self, logfile) -> None: @@ -200,15 +223,14 @@ def close_logfile(logdir: str): # 100MB的日志文件能缩小到3.7MB。 -# 重写视频文件扫描,消除递归,取消全局变量,新增失败文件列表跳过处理 -def movie_lists(root, conf, regexstr): - escape_folder = re.split("[,,]", conf.escape_folder()) +# 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 +def movie_lists(source_folder, regexstr): + conf = config.getInstance() main_mode = conf.main_mode() debug = conf.debug() nfo_skip_days = conf.nfo_skip_days() soft_link = conf.soft_link() - total = [] - file_type = conf.media_type().upper().split(",") + file_type = conf.media_type().lower().split(",") trailerRE = re.compile(r'-trailer\.', re.IGNORECASE) cliRE = None if isinstance(regexstr, str) and len(regexstr): @@ -216,61 +238,85 @@ def movie_lists(root, conf, regexstr): cliRE = re.compile(regexstr, re.IGNORECASE) except: pass + failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt' failed_set = set() - if main_mode == 3 or soft_link: + if (main_mode == 3 or soft_link) and not conf.ignore_failed_list(): try: - with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt: + with open(failed_list_txt_path, 'r', encoding='utf-8') as flt: flist = flt.read().splitlines() failed_set = set(flist) - flt.close() if len(flist) != len(failed_set): - with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt: - flt.writelines([line + '\n' for line in failed_set]) - flt.close() + with open(failed_list_txt_path, 'w', encoding='utf-8') as flt: + wtlines = [line + '\n' for line in failed_set] + wtlines.sort() + flt.writelines(wtlines) except: pass - for current_dir, subdirs, files in os.walk(root, topdown=False): - if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0: + if not Path(source_folder).is_dir(): + print('[-]Source folder not found!') + return [] + total = [] + source = Path(source_folder).resolve() + skip_failed_cnt, skip_nfo_days_cnt = 0, 0 + escape_folder_set = set(re.split("[,,]", conf.escape_folder())) + for full_name in source.glob(r'**/*'): + if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set: continue - for f in files: - full_name = os.path.join(current_dir, f) - if not 
os.path.splitext(full_name)[1].upper() in file_type:
-                continue
-            absf = os.path.abspath(full_name)
-            if absf in failed_set:
-                if debug:
-                    print('[!]Skip failed file:', absf)
-                continue
-            if cliRE and not cliRE.search(absf):
-                continue
-            if main_mode == 3 and nfo_skip_days > 0:
-                nfo = Path(absf).with_suffix('.nfo')
-                if file_modification_days(nfo) <= nfo_skip_days:
-                    continue
-            if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
-                total.append(absf)
+        if not full_name.suffix.lower() in file_type:
+            continue
+        absf = str(full_name)
+        if absf in failed_set:
+            skip_failed_cnt += 1
+            if debug:
+                print('[!]Skip failed movie:', absf)
+            continue
+        is_sym = full_name.is_symlink()
+        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标
+            continue # file is symlink or hardlink(Linux/NTFS/Darwin)
+        # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB)
+        movie_size = 0 if is_sym else full_name.stat().st_size # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测
+        if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
+            continue
+        if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
+            continue
+        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+            skip_nfo_days_cnt += 1
+            if debug:
+                print(f"[!]Skip movie by its .nfo, which was modified within {nfo_skip_days} days: '{absf}'")
+            continue
+        total.append(absf)
+
+    if skip_failed_cnt:
+        print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
+    if skip_nfo_days_cnt:
+        print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' whose .nfo was modified within {nfo_skip_days} days.")
     if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
         return total
     # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的
     skip_numbers = set()
-    success_folder = conf.success_folder()
-    for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
-        for f in files:
-            f_obj = Path(f)
-            if f_obj.suffix.lower() != '.nfo':
-                continue
-            if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
-                continue
-            number = get_number(False, f_obj.stem)
-            if number:
-                skip_numbers.add(number.upper())
+    success_folder = Path(conf.success_folder()).resolve()
+    for f in success_folder.glob(r'**/*'):
+        if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+            continue
+        if file_modification_days(f) > nfo_skip_days:
+            continue
+        number = get_number(False, f.stem)
+        if not number:
+            continue
+        skip_numbers.add(number.lower())
+
     rm_list = []
     for f in total:
         n_number = get_number(False, os.path.basename(f))
-        if n_number and n_number.upper() in skip_numbers:
+        if n_number and n_number.lower() in skip_numbers:
             rm_list.append(f)
     for f in rm_list:
         total.remove(f)
+        if debug:
+            print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
+    if len(rm_list):
+        print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' whose .nfo was modified within {nfo_skip_days} days.")
+
     return total
 
 
@@ -299,14 +345,18 @@ def rm_empty_folder(path):
         pass
 
 
-def create_data_and_move(file_path: str, c: config.Config, debug):
+def create_data_and_move(file_path: str, zero_op):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
+    c = config.getInstance()
+    debug = c.debug()
     file_name = os.path.basename(file_path)
     n_number = get_number(debug, file_name)
    file_path = os.path.abspath(file_path)
 
     if debug == True:
         print(f"[!] 
[{n_number}] As Number making data for '{file_path}'") + if zero_op: + return if n_number: core_main(file_path, n_number, c) else: @@ -315,6 +365,8 @@ def create_data_and_move(file_path: str, c: config.Config, debug): else: try: print(f"[!] [{n_number}] As Number making data for '{file_path}'") + if zero_op: + return if n_number: core_main(file_path, n_number, c) else: @@ -357,8 +409,17 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu if __name__ == '__main__': version = '5.0.1' urllib3.disable_warnings() #Ignore http proxy warning + + # Read config.ini first, in argparse_function() need conf.failed_folder() + conf = config.Config("config.ini") + # Parse command line args - single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version) + single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version) + + main_mode = conf.main_mode() + if not main_mode in (1, 2, 3): + print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") + sys.exit(4) dupe_stdout_to_logfile(logdir) @@ -368,9 +429,8 @@ if __name__ == '__main__': print('[*]======================================================') print('[*]严禁在墙内宣传本项目') - # Read config.ini - conf = config.Config("config.ini") - + start_time = time.time() + print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S")) if conf.update_check(): check_update(version) @@ -382,9 +442,15 @@ if __name__ == '__main__': print('[!]Enable soft link') if len(sys.argv)>1: print('[!]CmdLine:'," ".join(sys.argv[1:])) + print('[+]Main Working mode ## {}: {} ## {}{}{}' + .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1], + "" if not conf.multi_threading() else ", multi_threading on", + "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}", + "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}" + ) if not single_file_path else ('-','Single File', '','','')) + ) create_failed_folder(conf.failed_folder()) - start_time = time.time() if not single_file_path == '': #Single File print('[+]==================== Single File =====================') @@ -393,32 +459,31 @@ if __name__ == '__main__': else: create_data_and_move_with_custom_number(single_file_path, conf, custom_number) else: - if folder_path == '': + folder_path = conf.source_folder() + if not isinstance(folder_path, str) or folder_path == '': folder_path = os.path.abspath(".") - movie_list = movie_lists(folder_path, conf, regexstr) + movie_list = movie_lists(folder_path, regexstr) count = 0 count_all = str(len(movie_list)) - print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S")) - main_mode = conf.main_mode() + print('[+]Find', count_all, 'movies.') stop_count = conf.stop_counter() if stop_count<1: stop_count = 999999 else: count_all = str(min(len(movie_list), stop_count)) - if main_mode == 3: - print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。') + for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' print('[!] 
{:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) - create_data_and_move(movie_path, conf, conf.debug()) + create_data_and_move(movie_path, zero_op) if count >= stop_count: print("[!]Stop counter triggered!") break - if conf.del_empty_folder(): + if conf.del_empty_folder() and not zero_op: rm_empty_folder(conf.success_folder()) rm_empty_folder(conf.failed_folder()) if len(folder_path): @@ -433,7 +498,7 @@ if __name__ == '__main__': close_logfile(logdir) - if not (conf.auto_exit() or auto_exit): + if not conf.auto_exit(): input("Press enter key exit, you can check the error message before you exit...") sys.exit(0) From 35c4bf85ae795785dd56490cfe78d979c2f2449a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 16:01:31 +0800 Subject: [PATCH 15/56] argparse:need str as default value type --- AV_Data_Capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 9b75f50..4411538 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -42,7 +42,7 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, bool]: parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") - default_logdir = Path.home() / '.avlogs' + default_logdir = str(Path.home() / '.avlogs') parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?', help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. default folder for current user: '{default_logdir}'. 
Change default folder to an empty file, From 288acfb264c7ad1d1ffd3fe1d96b4071ae65b836 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 05:28:44 +0800 Subject: [PATCH 16/56] =?UTF-8?q?=E4=B8=8D=E4=BC=9A=E9=80=A0=E6=88=90bug?= =?UTF-8?q?=EF=BC=8C=E4=BD=86=E8=BF=98=E6=98=AF=E6=94=B9=E4=B8=80=E4=B8=8B?= =?UTF-8?q?=E5=A5=BD=E4=B8=80=E4=BA=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 4411538..5def067 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -177,12 +177,12 @@ def close_logfile(logdir: str): # rm -rf $LOGDIR """ # 第一步,合并到月 + today = datetime.today() for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] if not txts or not len(txts): break txts.sort() - today = datetime.today() tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") deadline_month = f'avdc_{tmstr_3_month_ago}' month_merge = [f for f in txts if f.stem < deadline_month] From 890452bffd9a978f0dc067428cc1d6b6bdd44ab3 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 09:07:38 +0800 Subject: [PATCH 17/56] =?UTF-8?q?=E8=A1=A5=E4=B8=8A=E6=BC=8F=E6=8E=89?= =?UTF-8?q?=E6=B2=A1=E6=9B=B4=E6=96=B0=E7=9A=84config=E6=89=93=E5=8C=85?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E9=83=A8=E5=88=86=EF=BC=8C=E5=85=88=E5=89=8D?= =?UTF-8?q?=E8=A2=AB=E6=88=91=E7=9A=84WinMerge=20filter=E8=A7=84=E5=88=99?= =?UTF-8?q?=E8=BF=87=E6=BB=A4=E6=8E=89=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/main.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6b0a748..289c88e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,7 +33,7 @@ jobs: - name: Test number_perser.get_number run: | python number_parser.py -v - + - name: Build with PyInstaller for macos/ubuntu if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' run: | @@ -42,6 +42,8 @@ jobs: --hidden-import ADC_function.py \ --hidden-import core.py \ --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ - name: Build with PyInstaller for windows if: matrix.os == 'windows-latest' @@ -51,6 +53,8 @@ jobs: --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" ` + --add-data "Img;Img" ` + --add-data "config.ini;." 
` - name: Copy config.ini run: | From f60166922984c1e9f4cbc608ca43c8ba52e30d77 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 12:23:00 +0800 Subject: [PATCH 18/56] javdb:change to site 31 and 32 --- WebCrawler/javdb.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 756be1c..3a0a18d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -213,14 +213,16 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = secrets.choice(["javdb9", "javdb30"]) + javdb_site = secrets.choice(["javdb31", "javdb32"]) + if config.getInstance().debug(): + print(f'[!]javdb:select site {javdb_site}') try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass # else: # number = number.upper() number = number.upper() - cookie_json = './' + javdb_site + '.json' + cookie_json = javdb_site + '.json' javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 cookies_dict, cookies_filepath = load_cookies(cookie_json) @@ -326,6 +328,7 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") if __name__ == "__main__": + config.G_conf_override['debug_mode:switch'] = True # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) From bd3504f3b5ed200ab79e2783f7594ee35c4ffb53 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:32:00 +0800 Subject: [PATCH 19/56] javdb:only accept one login site after javdb site update --- WebCrawler/javdb.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 3a0a18d..841d8d6 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -213,27 +213,31 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = secrets.choice(["javdb31", "javdb32"]) - if config.getInstance().debug(): - print(f'[!]javdb:select site {javdb_site}') + # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, + # 如果无.json文件则按选择最后一个站点。 + javdb_sites = ["javdb31", "javdb32"] + debug = config.getInstance().debug() try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass # else: # number = number.upper() number = number.upper() - cookie_json = javdb_site + '.json' javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 - cookies_dict, cookies_filepath = load_cookies(cookie_json) - if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): - cdays = file_modification_days(cookies_filepath) - if cdays < 7: - javdb_cookies = cookies_dict - elif cdays != 9999: - print( -f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - + for cj in javdb_sites: + javdb_site = cj + cookie_json = javdb_site + '.json' + cookies_dict, cookies_filepath = load_cookies(cookie_json) + if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): + cdays = file_modification_days(cookies_filepath) + if cdays < 7: + javdb_cookies = cookies_dict + elif cdays != 9999: + print(f'[!]Cookies file {cookies_filepath} was updated 
{cdays} days ago, it will not be used for HTTP requests.') + break + if debug: + print(f'[!]javdb:select site {javdb_site}') try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' query_result = get_html(javdb_url, cookies=javdb_cookies) @@ -338,3 +342,4 @@ if __name__ == "__main__": # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found + print(main('FC2-2314275')) From 3873d1aa4cabb0eb690b2cd6f50a7f1eb181c07b Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:37:40 +0800 Subject: [PATCH 20/56] update user agent --- ADC_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADC_function.py b/ADC_function.py index e755fb5..09fb11d 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -20,7 +20,7 @@ def getXpathSingle(htmlcode, xpath): return result1 -G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36' # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): From d010ea6d517e74895c40543fbf4decd05dad7f2d Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:42:11 +0800 Subject: [PATCH 21/56] =?UTF-8?q?=E6=B8=85=E7=90=86=E5=85=A8=E9=83=A8conf?= =?UTF-8?q?=E7=A9=BF=E6=A2=AD=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 23 ++++--- WebCrawler/__init__.py | 3 +- core.py | 139 +++++++++++++++++++++-------------------- 3 files changed, 85 insertions(+), 80 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 5def067..8e1cb76 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -347,10 +347,8 @@ def rm_empty_folder(path): def create_data_and_move(file_path: str, zero_op): # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 - c = config.getInstance() - debug = c.debug() - file_name = os.path.basename(file_path) - n_number = get_number(debug, file_name) + debug = config.getInstance().debug() + n_number = get_number(debug, os.path.basename(file_path)) file_path = os.path.abspath(file_path) if debug == True: @@ -358,7 +356,7 @@ def create_data_and_move(file_path: str, zero_op): if zero_op: return if n_number: - core_main(file_path, n_number, c) + core_main(file_path, n_number) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -368,7 +366,7 @@ def create_data_and_move(file_path: str, zero_op): if zero_op: return if n_number: - core_main(file_path, n_number, c) + core_main(file_path, n_number) else: raise ValueError("number empty") print("[*]======================================================") @@ -377,17 +375,18 @@ def create_data_and_move(file_path: str, zero_op): print('[-]', err) try: - moveFailedFolder(file_path, conf) + moveFailedFolder(file_path) except Exception as err: print('[!]', err) -def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number): +def create_data_and_move_with_custom_number(file_path: str, custom_number): + conf = config.getInstance() file_name = os.path.basename(file_path) try: print("[!] 
[{1}] As Number making data for '{0}'".format(file_path, custom_number)) if custom_number: - core_main(file_path, custom_number, c) + core_main(file_path, custom_number) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -395,7 +394,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu print("[-] [{}] ERROR:".format(file_path)) print('[-]', err) - if c.soft_link(): + if conf.soft_link(): print("[-]Link {} to failed folder".format(file_path)) os.symlink(file_path, os.path.join(conf.failed_folder(), file_name)) else: @@ -455,9 +454,9 @@ if __name__ == '__main__': if not single_file_path == '': #Single File print('[+]==================== Single File =====================') if custom_number == '': - create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path))) + create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path))) else: - create_data_and_move_with_custom_number(single_file_path, conf, custom_number) + create_data_and_move_with_custom_number(single_file_path, custom_number) else: folder_path = conf.source_folder() if not isinstance(folder_path, str) or folder_path == '': diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index dc54b46..c5d02b5 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测 return True -def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数据 +def get_data_from_json(file_number): # 从JSON返回元数据 """ iterate through all services and fetch the data """ @@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 "fc2club": fc2club.main } + conf = config.getInstance() # default fetch order list, from the beginning to the end sources = conf.sources().split(',') if not len(conf.sources()) > 80: diff --git a/core.py b/core.py index 264d30b..94a8503 100755 --- a/core.py +++ b/core.py @@ -21,7 +21,8 @@ def escape_path(path, escape_literals: str): # Remove escape literals return path -def moveFailedFolder(filepath, conf): +def moveFailedFolder(filepath): + conf = config.getInstance() failed_folder = conf.failed_folder() soft_link = conf.soft_link() # 模式3或软连接,改为维护一个失败列表,启动扫描时加载用于排除该路径,以免反复处理 @@ -65,14 +66,15 @@ def get_info(json_data): # 返回json里的数据 return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label -def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath): +def small_cover_check(path, number, cover_small, leak_word, c_word, filepath): filename = f"{number}{leak_word}{c_word}-poster.jpg" - download_file_with_filename(cover_small, filename, path, conf, filepath) + download_file_with_filename(cover_small, filename, path, filepath) print('[+]Image Downloaded! 
' + os.path.join(path, filename)) -def create_folder(json_data, conf: config.Config): # 创建文件夹 +def create_folder(json_data): # 创建文件夹 title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) + conf = config.getInstance() success_folder = conf.success_folder() actor = json_data.get('actor') location_rule = eval(conf.location_rule(), json_data) @@ -104,7 +106,8 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 # =====================资源下载部分=========================== # path = examle:photo , video.in the Project Folder! -def download_file_with_filename(url, filename, path, conf: config.Config, filepath): +def download_file_with_filename(url, filename, path, filepath): + conf = config.getInstance() configProxy = conf.proxy() for i in range(configProxy.retry): @@ -156,20 +159,20 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry)) except IOError: print(f"[-]Create Directory '{path}' failed!") - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return print('[-]Connect Failed! Please check your Proxy or Network!') - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return -def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config): - if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed': +def trailer_download(trailer, leak_word, c_word, number, path, filepath): + if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed': return - configProxy = conf.proxy() + configProxy = config.getInstance().proxy() for i in range(configProxy.retry): if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0: print('[!]Video Download Failed! Trying again. [{}/3]', i + 1) - download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) + download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) continue else: break @@ -178,20 +181,20 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: c print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4') # 剧照下载成功,否则移动到failed -def extrafanart_download(data, path, conf: config.Config, filepath): +def extrafanart_download(data, path, filepath): j = 1 - path = os.path.join(path, conf.get_extrafanart()) + path = os.path.join(path, config.getInstance().get_extrafanart()) + configProxy = config.getInstance().proxy() for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) - if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed': - moveFailedFolder(filepath, conf) + if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': + moveFailedFolder(filepath) return - configProxy = conf.proxy() for i in range(configProxy.retry): if os.path.getsize(jpg_fullpath) == 0: print('[!]Image Download Failed! Trying again. 
[{}/3]', i + 1) - download_file_with_filename(url, jpg_filename, path, conf, filepath) + download_file_with_filename(url, jpg_filename, path, filepath) continue else: break @@ -203,18 +206,18 @@ def extrafanart_download(data, path, conf: config.Config, filepath): # 封面是否下载成功,否则移动到failed -def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath): +def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) - if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed': - moveFailedFolder(filepath, conf) + if download_file_with_filename(cover, filename, path, filepath) == 'failed': + moveFailedFolder(filepath) return - configProxy = conf.proxy() + configProxy = config.getInstance().proxy() for i in range(configProxy.retry): if os.path.getsize(full_filepath) == 0: print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) - download_file_with_filename(cover, filename, path, conf, filepath) + download_file_with_filename(cover, filename, path, filepath) continue else: break @@ -224,9 +227,9 @@ def image_download(cover, number, leak_word, c_word, path, conf: config.Config, shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")) -def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf): +def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored): title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) - if conf.main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 + if config.getInstance().main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 nfo_path = str(Path(filepath).with_suffix('.nfo')) else: nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo") @@ -292,7 +295,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" <num>" + number + "</num>", file=code) print(" <premiered>" + release + "</premiered>", file=code) print(" <cover>" + cover + "</cover>", file=code) - if conf.is_trailer(): + if config.getInstance().is_trailer(): print(" <trailer>" + trailer + "</trailer>", file=code) print(" <website>" + website + "</website>", file=code) print("</movie>", file=code) @@ -300,12 +303,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except IOError as e: print("[-]Write Failed!") print("[-]", e) - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return except Exception as e1: print("[-]Write Failed!") print("[-]", e1) - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return @@ -334,7 +337,7 @@ def cutImage(imagecut, path, number, leak_word, c_word): # leak 流出 参数值为 1 0 # uncensored 无码 参数值为 1 0 # ========================================================================加水印 -def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config): +def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored): mark_type = '' if cn_sub: mark_type += ',字幕' @@ -344,17 +347,17 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Conf mark_type += ',无码' if mark_type == '': return - add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf) + add_mark_thread(thumb_path, cn_sub, leak, 
uncensored) print('[+]Thumb Add Mark: ' + mark_type.strip(',')) - add_mark_thread(poster_path, cn_sub, leak, uncensored, conf) + add_mark_thread(poster_path, cn_sub, leak, uncensored) print('[+]Poster Add Mark: ' + mark_type.strip(',')) -def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf): +def add_mark_thread(pic_path, cn_sub, leak, uncensored): size = 14 img_pic = Image.open(pic_path) # 获取自定义位置,取余配合pos达到顺时针添加的效果 # 左上 0, 右上 1, 右下 2, 左下 3 - count = conf.watermark_type() + count = config.getInstance().watermark_type() if cn_sub == 1 or cn_sub == '1': add_to_pic(pic_path, img_pic, size, count, 1) # 添加 count = (count + 1) % 4 @@ -404,7 +407,7 @@ def add_to_pic(pic_path, img_pic, size, count, mode): img_pic.save(pic_path, quality=95) # ========================结束================================= -def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config): # 文件路径,番号,后缀,要移动至的位置 +def paste_file_to_folder(filepath, path, number, leak_word, c_word): # 文件路径,番号,后缀,要移动至的位置 filepath_obj = pathlib.Path(filepath) houzhui = filepath_obj.suffix file_parent_origin_path = str(filepath_obj.parent) @@ -414,10 +417,11 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config # 同名覆盖致使全部文件损失且不可追回的最坏情况 if os.path.exists(targetpath): raise FileExistsError('File Exists on destination path, we will never overwriting.') + soft_link = config.getInstance().soft_link() # 如果soft_link=1 使用软链接 - if conf.soft_link() == 0: + if soft_link == 0: shutil.move(filepath, targetpath) - elif conf.soft_link() == 1: + elif soft_link == 1: # 先尝试采用相对路径,以便网络访问时能正确打开视频,失败则可能是因为跨盘符等原因无法支持 # 相对路径径,改用绝对路径方式尝试建立软链接 try: @@ -425,7 +429,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config os.symlink(filerelpath, targetpath) except: os.symlink(filepath_obj.resolve(), targetpath) - elif conf.soft_link() == 2: + elif soft_link == 2: shutil.move(filepath, targetpath) # 移走文件后,在原来位置增加一个可追溯的软链接,指向文件新位置 # 以便追查文件从原先位置被移动到哪里了,避免因为得到错误番号后改名移动导致的文件失踪 @@ -434,7 +438,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config if targetabspath != os.path.abspath(filepath): targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path) os.symlink(targetrelpath, filepath) - sub_res = conf.sub_rule() + sub_res = config.getInstance().sub_rule() for subname in sub_res: sub_filepath = str(filepath_obj.with_suffix(subname)) @@ -445,7 +449,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config except FileExistsError as fee: print(f'[-]FileExistsError: {fee}') - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return except PermissionError: print('[-]Error! 
Please run as administrator!') @@ -455,7 +459,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config return -def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf): # 文件路径,番号,后缀,要移动至的位置 +def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word): # 文件路径,番号,后缀,要移动至的位置 if multi_part == 1: number += part # 这时number会被附加上CD1后缀 filepath_obj = pathlib.Path(filepath) @@ -465,12 +469,12 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo if os.path.exists(targetpath): raise FileExistsError('File Exists on destination path, we will never overwriting.') try: - if conf.soft_link(): + if config.getInstance().soft_link(): os.symlink(filepath, targetpath) else: shutil.move(filepath, targetpath) - sub_res = conf.sub_rule() + sub_res = config.getInstance().sub_rule() for subname in sub_res: sub_filepath = str(filepath_obj.with_suffix(subname)) if os.path.isfile(sub_filepath): # 字幕移动 @@ -488,7 +492,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo print(f'[-]OS Error errno {oserr.errno}') return -def get_part(filepath, conf): +def get_part(filepath): try: if re.search('-CD\d+', filepath): return re.findall('-CD\d+', filepath)[0] @@ -496,7 +500,7 @@ def get_part(filepath, conf): return re.findall('-cd\d+', filepath)[0] except: print("[-]failed!Please rename the filename again!") - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return @@ -516,7 +520,8 @@ def debug_print(data: json): pass -def core_main(file_path, number_th, conf: config.Config): +def core_main(file_path, number_th): + conf = config.getInstance() # =======================================================================初始化所需变量 multi_part = 0 part = '' @@ -530,11 +535,11 @@ def core_main(file_path, number_th, conf: config.Config): # 下面被注释的变量不需要 #rootpath= os.getcwd number = number_th - json_data = get_data_from_json(number, conf) # 定义番号 + json_data = get_data_from_json(number) # 定义番号 # Return if blank dict returned (data not found) if not json_data: - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return if json_data["number"] != number: @@ -549,7 +554,7 @@ def core_main(file_path, number_th, conf: config.Config): # =======================================================================判断-C,-CD后缀 if '-CD' in filepath or '-cd' in filepath: multi_part = 1 - part = get_part(filepath, conf) + part = get_part(filepath) if '-c.' in filepath or '-C.' 
in filepath or '中文' in filepath or '字幕' in filepath: cn_sub = '1' c_word = '-C' # 中文字幕影片后缀 @@ -573,7 +578,7 @@ def core_main(file_path, number_th, conf: config.Config): debug_print(json_data) # 创建文件夹 - #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf) + #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data) # main_mode # 1: 刮削模式 / Scraping mode @@ -581,28 +586,28 @@ def core_main(file_path, number_th, conf: config.Config): # 3:不改变路径刮削 if conf.main_mode() == 1: # 创建文件夹 - path = create_folder(json_data, conf) + path = create_folder(json_data) if multi_part == 1: number += part # 这时number会被附加上CD1后缀 # 检查小封面, 如果image cut为3,则下载小封面 if imagecut == 3: - small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath) + small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath) # creatFolder会返回番号路径 - image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath) + image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath) if not multi_part or part.lower() == '-cd1': try: # 下载预告片 if conf.is_trailer() and json_data.get('trailer'): - trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf) + trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath) except: pass try: - # 下载剧照 data, path, conf: config.Config, filepath + # 下载剧照 data, path, filepath if conf.is_extrafanart() and json_data.get('extrafanart'): - extrafanart_download(json_data.get('extrafanart'), path, conf, filepath) + extrafanart_download(json_data.get('extrafanart'), path, filepath) except: pass @@ -613,23 +618,23 @@ def core_main(file_path, number_th, conf: config.Config): poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) # 移动电影 - paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) + paste_file_to_folder(filepath, path, number, leak_word, c_word) # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored) elif conf.main_mode() == 2: # 创建文件夹 - path = create_folder(json_data, conf) + path = create_folder(json_data) # 移动文件 - paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf) + paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word) poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) elif conf.main_mode() == 3: path = str(Path(file_path).parent) @@ -638,19 +643,19 @@ def core_main(file_path, number_th, conf: config.Config): # 检查小封面, 如果image cut为3,则下载小封面 if imagecut == 3: - small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath) + 
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath) # creatFolder会返回番号路径 - image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath) + image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath) if not multi_part or part.lower() == '-cd1': # 下载预告片 if conf.is_trailer() and json_data.get('trailer'): - trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf) + trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath) - # 下载剧照 data, path, conf: config.Config, filepath + # 下载剧照 data, path, filepath if conf.is_extrafanart() and json_data.get('extrafanart'): - extrafanart_download(json_data.get('extrafanart'), path, conf, filepath) + extrafanart_download(json_data.get('extrafanart'), path, filepath) # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) @@ -659,8 +664,8 @@ def core_main(file_path, number_th, conf: config.Config): poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, - tag, json_data.get('actor_list'), liuchu, uncensored, conf) + tag, json_data.get('actor_list'), liuchu, uncensored) From b0959d1b18f931b052f3c3067fe13e578ff75d9e Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 20:29:17 +0800 Subject: [PATCH 22/56] =?UTF-8?q?javdb:=E6=97=A0=E6=9C=89=E6=95=88?= =?UTF-8?q?=E6=9C=9F=E5=86=85cookies=E6=96=87=E4=BB=B6=E6=97=B6=EF=BC=8C?= =?UTF-8?q?=E9=9A=8F=E6=9C=BA=E9=80=89=E6=8B=A9=E4=B8=80=E4=B8=AA=E7=AB=99?= =?UTF-8?q?=E7=82=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javdb.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 841d8d6..7d69404 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -214,7 +214,7 @@ def getSeries(a): def main(number): # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, - # 如果无.json文件则按选择最后一个站点。 + # 如果无.json文件或者超过有效期,则随机选择一个站点。 javdb_sites = ["javdb31", "javdb32"] debug = config.getInstance().debug() try: @@ -225,6 +225,7 @@ def main(number): number = number.upper() javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 + has_json = False for cj in javdb_sites: javdb_site = cj cookie_json = javdb_site + '.json' @@ -233,9 +234,12 @@ def main(number): cdays = file_modification_days(cookies_filepath) if cdays < 7: javdb_cookies = cookies_dict + has_json = True + break elif cdays != 9999: print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - break + if not has_json: + javdb_site = secrets.choice(javdb_sites) if debug: print(f'[!]javdb:select site {javdb_site}') try: From 0933e87944afabc1cdb18c26b272a60fa4554d33 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 10 Oct 2021 17:41:33 +0800 Subject: [PATCH 23/56] fix outline of javbus and javdb, which broke when airav went down --- ADC_function.py | 4 ++-- WebCrawler/javbus.py | 21 +++++++++++++++++++-- WebCrawler/javdb.py | 6 +++++- 3 files
changed, 26 insertions(+), 5 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 09fb11d..4480852 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -103,7 +103,7 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: return result.text -def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): +def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) if isinstance(cookies, dict): requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) @@ -113,7 +113,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d result = browser.open(url) if not result.ok: return '' - form = browser.select_form() if form_name is None else browser.select_form(form_name) + form = browser.select_form() if form_select is None else browser.select_form(form_select) if isinstance(fields, dict): for k, v in fields.items(): browser[k] = v diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 1af4359..c2ff11e 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -80,7 +80,7 @@ def getCID(htmlcode): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 try: @@ -91,6 +91,23 @@ def getOutline(number): #获取剧情介绍 except: pass return '' +def getOutline(number): #获取剧情介绍 从avno1.cc取得 + try: + number_up = number.upper() + result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except: + pass + return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -198,7 +215,7 @@ def main(number): return js if __name__ == "__main__" : - print(main('ADV-R0624')) # 404 + #print(main('ADV-R0624')) # 404 print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 7d69404..358682d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -196,7 +196,7 @@ def getDirector(a): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 try: htmlcode = get_html('https://cn.airav.wiki/video/' + number) from WebCrawler.airav import getOutline as airav_getOutline @@ -205,6 +205,9 @@ def getOutline(number): #获取剧情介绍 except: pass return '' 
+def getOutline(number): #获取剧情介绍 + from WebCrawler.javbus import getOutline as javbus_getOutline + return javbus_getOutline(number) def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -340,6 +343,7 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) + print(main('070116-197')) print(main('093021_539')) # 没有剧照 片商pacopacomama # print(main('FC2-2278260')) # print(main('FC2-735670')) From e5abac9138ee630d4cf978d7644a391fd9677e77 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 10 Oct 2021 18:02:53 +0800 Subject: [PATCH 24/56] add download_only_missing_image config item --- config.ini | 1 + config.py | 3 +++ core.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/config.ini b/config.ini index f33a578..06eda0c 100755 --- a/config.ini +++ b/config.ini @@ -18,6 +18,7 @@ nfo_skip_days=30 stop_counter=0 ; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 ignore_failed_list=0 +download_only_missing_images=1 [proxy] ;proxytype: http or socks5 or socks5h switch: 0 1 diff --git a/config.py b/config.py index 2b49ca0..3b325d9 100644 --- a/config.py +++ b/config.py @@ -141,6 +141,8 @@ class Config: return 0 def ignore_failed_list(self) -> bool: return self.getboolean_override("common", "ignore_failed_list") + def download_only_missing_images(self) -> bool: + return self.conf.getboolean("common", "download_only_missing_images") def is_transalte(self) -> bool: return self.conf.getboolean("transalte", "switch") def is_trailer(self) -> bool: @@ -264,6 +266,7 @@ class Config: conf.set(sec1, "nfo_skip_days", 30) conf.set(sec1, "stop_counter", 0) conf.set(sec1, "ignore_failed_list", 0) + conf.set(sec1, "download_only_missing_images", 1) sec2 = "proxy" conf.add_section(sec2) diff --git a/core.py b/core.py index 94a8503..6a8af37 100755 --- a/core.py +++ b/core.py @@ -183,11 +183,15 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath): # 剧照下载成功,否则移动到failed def extrafanart_download(data, path, filepath): j = 1 - path = os.path.join(path, config.getInstance().get_extrafanart()) - configProxy = config.getInstance().proxy() + conf = config.getInstance() + path = os.path.join(path, conf.get_extrafanart()) + configProxy = conf.proxy() + download_only_missing_images = conf.download_only_missing_images() for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) + if download_only_missing_images and os.path.isfile(jpg_fullpath) and os.path.getsize(jpg_fullpath): + continue if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': moveFailedFolder(filepath) return @@ -209,6 +213,8 @@ def extrafanart_download(data, path, filepath): def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) + if config.getInstance().download_only_missing_images() and os.path.isfile(full_filepath) and os.path.getsize(full_filepath): + return if download_file_with_filename(cover, filename, path, filepath) == 'failed': moveFailedFolder(filepath) return From 678a8f9bc817c32b46e551a22ee886ebed2f42a8 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 11 Oct 2021 10:24:46 +0800 Subject: [PATCH 25/56] Add signal handler --- AV_Data_Capture.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/AV_Data_Capture.py b/AV_Data_Capture.py index 8e1cb76..02ac84b 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -6,6 +6,7 @@ import sys import shutil import typing import urllib3 +import signal import config from datetime import datetime, timedelta @@ -223,6 +224,15 @@ def close_logfile(logdir: str): # 100MB的日志文件能缩小到3.7MB。 +def signal_handler(*args): + print('[!]Ctrl+C detected, Exit.') + sys.exit(9) + +def sigdebug_handler(*args): + config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"] + print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF')) + + # 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 def movie_lists(source_folder, regexstr): conf = config.getInstance() @@ -420,6 +430,11 @@ if __name__ == '__main__': print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") sys.exit(4) + signal.signal(signal.SIGINT, signal_handler) + if sys.platform == 'win32': + signal.signal(signal.SIGBREAK, sigdebug_handler) + else: + signal.signal(signal.SIGWINCH, sigdebug_handler) dupe_stdout_to_logfile(logdir) print('[*]================== AV Data Capture ===================') From f8dc05a38bad656a5d5ed186ea84ad0cce2ebc43 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:28:17 +0800 Subject: [PATCH 26/56] improve javbus and javdb outline source --- ADC_function.py | 2 +- WebCrawler/javbus.py | 12 +++++++++++- WebCrawler/javdb.py | 1 - WebCrawler/xcity.py | 13 ++++++++----- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 4480852..ed428bd 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -2,7 +2,7 @@ from os import replace import requests import hashlib from pathlib import Path -#import secrets +import secrets import os.path import uuid import json diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index c2ff11e..e739424 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -93,8 +93,12 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number): #获取剧情介绍 从avno1.cc取得 try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 number_up = number.upper() - result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + result, browser = get_html_by_form(url, form_select='div.wrapper > div.header > div.search > form', fields = {'kw' : number_up}, return_type = 'browser') @@ -107,6 +111,12 @@ def getOutline(number): #获取剧情介绍 从avno1.cc取得 return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() except: pass + from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline + try: + detail_html, browser = open_by_browser(number_up) + return xcity_getOutline(detail_html) + except: + pass return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 358682d..4b0d4c9 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -5,7 +5,6 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -import secrets # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 
'replace', line_buffering = True) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index 858dd54..4bbdec1 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -181,11 +181,10 @@ def getExtrafanart(htmlcode): # 获取剧照 return s return '' -def main(number): - try: +def open_by_browser(number): xcity_number = number.replace('-','') query_result, browser = get_html_by_form( - 'https://xcity.jp/about/', + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, return_type = 'browser') if not query_result or not query_result.ok: @@ -193,12 +192,16 @@ def main(number): result = browser.follow_link(browser.links('avod\/detail')[0]) if not result.ok: raise ValueError("xcity.py: detail page not found") - detail_page = str(browser.page) + return str(browser.page), browser + +def main(number): + try: + detail_page, browser = open_by_browser(number) url = browser.url newnum = getNum(detail_page).upper() number_up = number.upper() if newnum != number_up: - if newnum == xcity_number.upper(): + if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found") From c0a4ce638c0bb86de8506d03926ea1cb82361833 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:29:53 +0800 Subject: [PATCH 27/56] call moveFailedFolder when empty number on debug branch --- AV_Data_Capture.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 02ac84b..9ae551b 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -369,6 +369,7 @@ def create_data_and_move(file_path: str, zero_op): core_main(file_path, n_number) else: print("[-] number empty ERROR") + moveFailedFolder(file_path) print("[*]======================================================") else: try: From f26987ddf96cac2556137a80cc7e21a953b26883 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:42:30 +0800 Subject: [PATCH 28/56] move into try block --- WebCrawler/javbus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index e739424..46628cf 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -111,9 +111,9 @@ def getOutline(number): #获取剧情介绍 从avno1.cc取得 return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() except: pass - from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline try: - detail_html, browser = open_by_browser(number_up) + from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline + detail_html, browser = open_by_browser(number) return xcity_getOutline(detail_html) except: pass From 317449c568fb66ac7280a782459f2c1dd604d5a9 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 09:11:40 +0800 Subject: [PATCH 29/56] try fix issue 616: onedrive OSError input/output --- ADC_function.py | 3 +++ core.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index ed428bd..30c2ab9 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -622,3 +622,6 @@ def file_modification_days(filename) -> int: if days < 0: return 9999 return days + +def file_not_exist_or_empty(filepath) -> bool: + return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0 diff --git a/core.py b/core.py index 6a8af37..f38c6f1 100755 --- a/core.py +++ 
b/core.py @@ -170,13 +170,13 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath): return configProxy = config.getInstance().proxy() for i in range(configProxy.retry): - if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0: + if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'): print('[!]Video Download Failed! Trying again. [{}/3]', i + 1) download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) continue else: break - if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0: + if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'): return print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4') @@ -190,19 +190,19 @@ def extrafanart_download(data, path, filepath): for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) - if download_only_missing_images and os.path.isfile(jpg_fullpath) and os.path.getsize(jpg_fullpath): + if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath): continue if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': moveFailedFolder(filepath) return for i in range(configProxy.retry): - if os.path.getsize(jpg_fullpath) == 0: + if file_not_exist_or_empty(jpg_fullpath): print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) download_file_with_filename(url, jpg_filename, path, filepath) continue else: break - if os.path.getsize(jpg_fullpath) == 0: + if file_not_exist_or_empty(jpg_fullpath): return print('[+]Image Downloaded!', jpg_fullpath) j += 1 @@ -213,7 +213,7 @@ def extrafanart_download(data, path, filepath): def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) - if config.getInstance().download_only_missing_images() and os.path.isfile(full_filepath) and os.path.getsize(full_filepath): + if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath): return if download_file_with_filename(cover, filename, path, filepath) == 'failed': moveFailedFolder(filepath) @@ -221,13 +221,13 @@ def image_download(cover, number, leak_word, c_word, path, filepath): configProxy = config.getInstance().proxy() for i in range(configProxy.retry): - if os.path.getsize(full_filepath) == 0: + if file_not_exist_or_empty(full_filepath): print('[!]Image Download Failed! Trying again. 
[{}/3]', i + 1) download_file_with_filename(cover, filename, path, filepath) continue else: break - if os.path.getsize(full_filepath) == 0: + if file_not_exist_or_empty(full_filepath): return print('[+]Image Downloaded!', full_filepath) shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")) From 416e8be351ce5e9d70b2f4b47cf70c63f44cb724 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 10:07:53 +0800 Subject: [PATCH 30/56] merge PR#612 --- core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index f38c6f1..ae73af8 100755 --- a/core.py +++ b/core.py @@ -85,8 +85,8 @@ def create_folder(json_data): # 创建文件夹 if 'title' in conf.location_rule() and len(title) > maxlen: shorttitle = title[0:maxlen] location_rule = location_rule.replace(title, shorttitle) - - path = os.path.join(success_folder, location_rule).strip() + # 当演员为空时,location_rule被计算为'/number'绝对路径,导致路径连接忽略第一个路径参数,因此添加./使其始终为相对路径 + path = os.path.join(success_folder, f'./{location_rule.strip()}') if not os.path.exists(path): path = escape_path(path, conf.escape_literals()) try: From 7f8d500b134ed286336b9d68a9b71c5c93c1e204 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 21:00:32 +0800 Subject: [PATCH 31/56] correct the mechanicalsoup browser cookies calling method --- ADC_function.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 30c2ab9..e5afb4b 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -85,7 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + if isinstance(cookies, dict) and len(cookies): + s = requests.Session() + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) + else: + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() @@ -104,9 +109,12 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) - if isinstance(cookies, dict): - requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) + if isinstance(cookies, dict) and len(cookies): + s = requests.Session() + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) + else: + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() From 189f4db6161f6393cbd5d3c1c204153a3f0f7b26 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 21:16:48 +0800 Subject: [PATCH 32/56] javdb: get faster by benefiting from HTTP keep-alive --- WebCrawler/javdb.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git
a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 4b0d4c9..185d96b 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -1,4 +1,6 @@ import sys + +from mechanicalsoup.stateful_browser import StatefulBrowser sys.path.append('../') import re from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * @@ -246,7 +248,10 @@ def main(number): print(f'[!]javdb:select site {javdb_site}') try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' - query_result = get_html(javdb_url, cookies=javdb_cookies) + res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') + if not res.ok: + raise + query_result = res.text except: query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies) html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -267,8 +272,11 @@ def main(number): raise ValueError("number not found") correct_url = urls[0] try: - javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url - detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) + if isinstance(browser, StatefulBrowser): # get faster benefit from http keep-alive + detail_page = browser.open_relative(correct_url).text + else: + javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url + detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) @@ -344,8 +352,8 @@ if __name__ == "__main__": # print(main('BANK-022')) print(main('070116-197')) print(main('093021_539')) # 没有剧照 片商pacopacomama - print(main('FC2-2278260')) - print(main('FC2-735670')) + print(main('FC2-2278260')) + print(main('FC2-735670')) # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found print(main('FC2-2314275')) From b006aee34d1382e3494b6e94d4e2156e80ead7c9 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 21:21:12 +0800 Subject: [PATCH 33/56] failed_list.txt: keep order, remove duplication --- AV_Data_Capture.py | 16 ++++++++--------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 9ae551b..d9c54b2 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -252,14 +252,14 @@ def movie_lists(source_folder, regexstr): failed_set = set() if (main_mode == 3 or soft_link) and not conf.ignore_failed_list(): try: - with open(failed_list_txt_path, 'r', encoding='utf-8') as flt: - flist = flt.read().splitlines() - failed_set = set(flist) - if len(flist) != len(failed_set): - with open(failed_list_txt_path, 'w', encoding='utf-8') as flt: - wtlines = [line + '\n' for line in failed_set] - wtlines.sort() - flt.writelines(wtlines) + flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines() + failed_set = set(flist) + if len(flist) != len(failed_set): # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的 + fset = failed_set.copy() + for i in range(len(flist)-1, -1, -1): + fset.remove(flist[i]) if flist[i] in fset else flist.pop(i) + failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8') + assert len(fset) == 0 and len(flist) == len(failed_set) except: pass if not Path(source_folder).is_dir(): From a546c4e83e595f7b1a90a9c37942a1f6162d0e2a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 21:59:08 +0800 Subject: [PATCH 34/56] Parallel query on storyline data --- ADC_function.py | 10 +- AV_Data_Capture.py | 8 +- WebCrawler/javbus.py | 46 +++---- WebCrawler/javdb.py | 25 ++--
WebCrawler/storyline.py | 270 ++++++++++++++++++++++++++++++++++++++++ config.ini | 11 +- config.py | 19 +++ 7 files changed, 336 insertions(+), 53 deletions(-) create mode 100644 WebCrawler/storyline.py diff --git a/ADC_function.py b/ADC_function.py index e5afb4b..e43fe5f 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -85,12 +85,11 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): + s = None if isinstance(cookies, dict) and len(cookies): s = requests.Session() requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) - else: - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() @@ -109,12 +108,11 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): + s = None if isinstance(cookies, dict) and len(cookies): s = requests.Session() requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) - else: - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index d9c54b2..6c13e5d 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -416,7 +416,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number): print('[!]', err) -if __name__ == '__main__': +def main(): version = '5.0.1' urllib3.disable_warnings() #Ignore http proxy warning @@ -483,6 +483,7 @@ if __name__ == '__main__': count = 0 count_all = str(len(movie_list)) print('[+]Find', count_all, 'movies.') + print('[*]======================================================') stop_count = conf.stop_counter() if stop_count<1: stop_count = 999999 @@ -517,3 +518,8 @@ if __name__ == '__main__': input("Press enter key exit, you can check the error message before you exit...") sys.exit(0) + +import multiprocessing +if __name__ == '__main__': + multiprocessing.freeze_support() + main() diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 46628cf..c9d53f3 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -6,6 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * +from WebCrawler.storyline import getStoryline import inspect def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img @@ -91,33 +92,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 except: pass return '' -def getOutline(number): #获取剧情介绍 从avno1.cc取得 - try: - url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + - secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), - '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' - ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 - 
number_up = number.upper() - result, browser = get_html_by_form(url, - form_select='div.wrapper > div.header > div.search > form', - fields = {'kw' : number_up}, - return_type = 'browser') - if not result.ok: - raise - title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() - page_number = title[title.rfind(' '):].upper() - if not number_up in page_number: - raise - return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() - except: - pass - try: - from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline - detail_html, browser = open_by_browser(number) - return xcity_getOutline(detail_html) - except: - pass - return '' +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -156,11 +132,12 @@ def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') + title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','') dic = { - 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), + 'title': title, 'studio': getStudio(htmlcode), 'year': getYear(htmlcode), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(htmlcode), 'director': getDirector(htmlcode), 'actor': getActor(htmlcode), @@ -189,11 +166,12 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') + title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode))) dic = { - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), + 'title': title, 'studio': getStudio(htmlcode), 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(htmlcode), 'director': getDirector(htmlcode), 'actor': getActor(htmlcode), @@ -225,7 +203,11 @@ def main(number): return js if __name__ == "__main__" : - #print(main('ADV-R0624')) # 404 + config.G_conf_override['debug_mode:switch'] = True + print(main('ABP-888')) + print(main('ABP-960')) + # print(main('ADV-R0624')) # 404 + # print(main('MMNT-010')) print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 185d96b..241de49 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -1,13 +1,11 @@ import sys - -from mechanicalsoup.stateful_browser import StatefulBrowser sys.path.append('../') import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * -# import sys +from mechanicalsoup.stateful_browser import StatefulBrowser +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) @@ -206,9 +204,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 except: pass return '' -def getOutline(number): #获取剧情介绍 - from WebCrawler.javbus import getOutline as javbus_getOutline - return javbus_getOutline(number) +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ 
-309,7 +306,7 @@ def main(number): 'actor': getActor(detail_page), 'title': title, 'studio': getStudio(detail_page), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(detail_page), 'director': getDirector(detail_page), 'release': getRelease(detail_page), @@ -350,11 +347,13 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) - print(main('070116-197')) - print(main('093021_539')) # 没有剧照 片商pacopacomama - print(main('FC2-2278260')) - print(main('FC2-735670')) + # print(main('070116-197')) + # print(main('093021_539')) # 没有剧照 片商pacopacomama + # print(main('FC2-2278260')) + # print(main('FC2-735670')) # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found - print(main('FC2-2314275')) + # print(main('FC2-2314275')) + # print(main('EBOD-646')) + print(main('LOVE-262')) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py new file mode 100644 index 0000000..11142fc --- /dev/null +++ b/WebCrawler/storyline.py @@ -0,0 +1,270 @@ +import sys +sys.path.append('../') +import re +import json +from ADC_function import * +from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline +from multiprocessing import Pool +from difflib import SequenceMatcher +from unicodedata import category + +G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"} + + +# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 +def getStoryline(number, title): + start_time = time.time() + conf = config.getInstance() + debug = conf.debug() or conf.storyline_show() == 2 + storyine_sites = conf.storyline_site().split(',') + apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] + mp_args = ((site, number, title, debug) for site in apply_sites) + # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/ + with Pool() as proc_pool: + result = proc_pool.map(getStoryline_mp, mp_args) + if not debug and conf.storyline_show() == 0: + for value in result: + if isinstance(value, str) and len(value): + return value + return '' + # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 + cnt = len(apply_sites) + s = f'[!]MP Storyline 运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' + first = True + sel = '' + for i in range(cnt): + sl = len(result[i])if isinstance(result[i], str) else 0 + if sl and first: + s += f',[选中结果{apply_sites[i]}字数:{sl}]' + first = False + sel = result[i] + elif sl: + s += f',{apply_sites[i]}字数:{sl}' + else: + s += f',{apply_sites[i]}:空' + print(s) + return sel + + +def getStoryline_mp(args): + return _getStoryline_mp(*args) + + +# 注:新进程的print()不会写入日志中,将来调试修复失效数据源需直接查看标准输出,issue信息需截图屏幕 +def _getStoryline_mp(site, number, title, debug): + start_time = time.time() + storyline = None + if not isinstance(site, str): + return storyline + elif site == "airav": + storyline = getStoryline_airav(number, debug) + elif site == "avno1": + storyline = getStoryline_avno1(number, debug) + elif site == "xcity": + storyline = getStoryline_xcity(number, debug) + elif site == "amazon": + storyline = getStoryline_amazon(title, number, debug) + if not debug: + return storyline + print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( + site, + time.time() - start_time, + time.strftime("%H:%M:%S"), + storyline if isinstance(storyline, str) and len(storyline) else '[空]') + ) + return storyline + + +def getStoryline_airav(number, debug): + try: + number_up = number + site = 
secrets.choice(('airav.cc','airav4.club')) + url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' + res, browser = get_html_by_browser(url, return_type='browser') + if not res.ok: + raise ValueError(f"get_html_by_browser('{url}') failed") + avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div') + if number_up not in avs.select_one('a > h3').text.upper(): + raise ValueError("number not found") + detail_url = avs.select_one('a')['href'] + res = browser.open_relative(detail_url) + if not res.ok: + raise ValueError(f"browser.open_relative('{detail_url}') failed") + t = browser.page.select_one('head > title').text + airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper() + if number.upper() != airav_number: + raise ValueError(f"page number ->[{airav_number}] not match") + desc = browser.page.select_one('li.introduction > span').text.strip() + return desc + except Exception as e: + if debug: + print(f"[-]MP getOutline_airav Error: {e}, number [{number}].") + pass + return None + + +def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 + try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 + number_up = number.upper() + result, browser = get_html_by_form(url, + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise ValueError(f"get_html_by_form('{url}','{number_up}') failed") + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise ValueError(f"page number ->[{page_number}] not match") + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 + try: + #xcity_number = number.replace('-','') + query_result, browser = get_html_by_form( + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), + fields = {'q' : xcity_number.lower()}, + return_type = 'browser') + if not query_result or not query_result.ok: + raise ValueError("page not found") + result = browser.follow_link(browser.links('avod\/detail')[0]) + if not result.ok: + raise ValueError("detail page not found") + return browser.page.select_one('h2.title-detail + p.lead').text.strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_amazon(q_title, number, debug): + if not isinstance(q_title, str) or not len(q_title): + return None + try: + amazon_cookie, _ = load_cookies('amazon.json') + cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None + url = "https://www.amazon.co.jp/s?k=" + q_title + res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser') + if not res.ok: + raise ValueError("get_html_by_browser() failed") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + html = etree.fromstring(str(browser.page), etree.HTMLParser()) + titles =
html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") + urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") + if not len(urls) or len(urls) != len(titles): + raise ValueError("titles not found") + idx = amazon_select_one(titles, q_title, number, debug) + if not isinstance(idx, int) or idx < 0: + raise ValueError("title and number not found") + furl = urls[idx] + r = browser.open_relative(furl) + if not r.ok: + raise ValueError("browser.open_relative()) failed.") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + + ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip() + ama_t = re.sub(r'審査番号:\d+', '', ama_t) + + if cookie is None: + # 自动创建的cookies文件放在搜索路径表的末端,最低优先级。有amazon.co.jp帐号的用户可以从浏览器导出cookie放在靠前搜索路径 + ama_save = Path.home() / ".local/share/avdc/amazon.json" + ama_save.parent.mkdir(parents=True, exist_ok=True) + ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') + + return ama_t + + except Exception as e: + if debug: + print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}') + pass + return None + +# 查货架中DVD和蓝光商品中标题相似度高的 +def amazon_select_one(a_titles, q_title, number, debug): + sel = -1 + ratio = 0 + que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A)) + for loc in range(len(a_titles)): + t = a_titles[loc] + if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 + ratio = 1.0 + sel = loc + save_t_ = t + break + if not re.search('DVD|Blu-ray', t, re.I): + continue + ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) + ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A)) + findlen = 0 + lastpos = -1 + cnt = len(ama_t) + for c in reversed(ama_t): + cnt -= 1 + pos = que_t.rfind(c) + if lastpos >= 0: + pos_near = que_t[:lastpos].rfind(c) + if pos_near < 0: + findlen = 0 + lastpos = -1 + ama_t = ama_t[:cnt+1] + else: + pos = pos_near + if pos < 0: + if category(c) == 'Nd': + return -1 + ama_t = ama_t[:cnt] + findlen = 0 + lastpos = -1 + continue + if findlen > 0 and len(que_t) > 1 and lastpos == pos+1: + findlen += 1 + lastpos = pos + if findlen >= 4: + break + continue + findlen = 1 + lastpos = pos + if findlen==0: + return -1 + r = SequenceMatcher(None, ama_t, que_t).ratio() + if r > ratio: + sel = loc + ratio = r + save_t_ = ama_t + if ratio > 0.999: + break + + if ratio < 0.5: + return -1 + + if not debug: + # 目前采信相似度高于0.9的结果 + return sel if ratio >= 0.9 else -1 + + # debug 模式下记录识别准确率日志 + if ratio < 0.9: + # 相似度[0.5, 0.9)的淘汰结果单独记录日志 + (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return -1 + # 被采信的结果日志 + (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return sel diff --git a/config.ini b/config.ini index 06eda0c..5125ad3 100755 --- a/config.ini +++ b/config.ini @@ -7,7 +7,7 @@ soft_link=0 failed_move=1 auto_exit=0 transalte_to_sc=0 -multi_threading=1 +multi_threading=0 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) actor_gender=female del_empty_folder=1 @@ -85,3 +85,12 @@ water=2 switch=0 extrafanart_folder=extrafanart +; 剧情简介 +[storyline] +; website为javbus或javdb时,site为获取剧情简介信息的可选数据源站点列表。列表内站点同时并发查询,取值优先级 +; 
从左到右,靠左站点没数据才会采用后面站点获得的。其中airav和avno1是中文剧情简介,xcity和amazon是日语的,由 +; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 +; site= +site=airav,avno1,xcity,amazon +; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 +show_result=0 diff --git a/config.py b/config.py index 3b325d9..3226a55 100644 --- a/config.py +++ b/config.py @@ -240,6 +240,20 @@ class Config: def debug(self) -> bool: return self.getboolean_override("debug_mode", "switch") + def storyline_site(self) -> str: + try: + return self.conf.get("storyline", "site") + except: + return "airav,avno1,xcity,amazon" + + def storyline_show(self) -> int: + try: + v = self.conf.getint("storyline", "show_result") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 0 + + @staticmethod def _exit(sec: str) -> None: print("[-] Read config error! Please check the {} section in config.ini", sec) @@ -333,6 +347,11 @@ class Config: conf.set(sec13, "switch", 1) conf.set(sec13, "extrafanart_folder", "extrafanart") + sec14 = "storyline" + conf.add_section(sec14) + conf.set(sec14, "site", "airav,avno1,xcity,amazon") + conf.set(sec14, "show_result", 0) + return conf From bc3cda953d2f4636a149b7f73b6110f47707b965 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:29:57 +0800 Subject: [PATCH 35/56] fix --- WebCrawler/storyline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 11142fc..567c675 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -130,7 +130,7 @@ def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 try: - #xcity_number = number.replace('-','') + xcity_number = number.replace('-','') query_result, browser = get_html_by_form( 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, From 6624ed7224df104c76f6f577b6a76dd604ea4997 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:47:49 +0800 Subject: [PATCH 36/56] clean up --- WebCrawler/storyline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 567c675..5923d7d 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -3,7 +3,6 @@ sys.path.append('../') import re import json from ADC_function import * -from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline from multiprocessing import Pool from difflib import SequenceMatcher from unicodedata import category From 3420f918f50137d8e8ba7e23e7d5f490198a0c76 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:53:53 +0800 Subject: [PATCH 37/56] fix ratio.txt log lost newline --- WebCrawler/storyline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5923d7d..5d74d4e 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -261,9 +261,9 @@ def amazon_select_one(a_titles, q_title, number, debug): if ratio < 0.9: # 相似度[0.5, 0.9)的淘汰结果单独记录日志 (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( - f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') return -1 # 被采信的结果日志 (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( - f' [{number}] 
Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') return sel From 56bbfe6f240cbc3b3e2dd533513af4cce78f79e7 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 23:25:19 +0800 Subject: [PATCH 38/56] storyline.py: skip SequenceMatcher when number matches --- WebCrawler/storyline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5d74d4e..b32778d 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -203,10 +203,7 @@ def amazon_select_one(a_titles, q_title, number, debug): for loc in range(len(a_titles)): t = a_titles[loc] if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 - ratio = 1.0 - sel = loc - save_t_ = t - break + return loc if not re.search('DVD|Blu-ray', t, re.I): continue ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) From c9b96f65ab37d48e3d1b34585207aca051598152 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 08:47:11 +0800 Subject: [PATCH 39/56] one line file copy --- config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config.py b/config.py index 3226a55..5624e85 100644 --- a/config.py +++ b/config.py @@ -73,8 +73,7 @@ class Config: # 用户目录才确定具有写权限,因此选择 ~/avdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 write_path = path_search_order[2] # Path.home() / "avdc.ini" - with open(write_path, 'w', encoding='utf-8') as wcfg: - wcfg.write(res_path.read_text(encoding='utf-8')) + write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8') print("Config file '{}' created.".format(write_path.resolve())) input("Press Enter key exit...") sys.exit(0) From 24b4f9f5e25a04b6b7e27f5e5f9ab8b03ab0bae5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 10:51:32 +0800 Subject: [PATCH 40/56] =?UTF-8?q?=E5=B0=86=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=9A=84=E6=9D=A5=E6=BA=90=E7=BD=91=E7=AB=99=E8=AE=B0=E5=85=A5?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E4=BB=A5=E4=BE=BF=E8=BF=9B=E8=A1=8C=E8=AF=84?= =?UTF-8?q?=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 4 ++-- WebCrawler/__init__.py | 2 ++ WebCrawler/storyline.py | 2 +- config.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index e43fe5f..5b1d507 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -1,6 +1,6 @@ from os import replace import requests -import hashlib +#import hashlib from pathlib import Path import secrets import os.path import uuid import json @@ -20,7 +20,7 @@ def getXpathSingle(htmlcode, xpath): return result1 -G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36' +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36' # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index c5d02b5..b6e7b2f 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -115,6 +115,7 @@ def get_data_from_json(file_number): # 从JSON返回元数据 json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get()) # if any service return a valid return, break if get_data_state(json_data): +
print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break pool.close() pool.terminate() @@ -126,6 +127,7 @@ def get_data_from_json(file_number): # 从JSON返回元数据 json_data = json.loads(func_mapping[source](file_number)) # if any service return a valid return, break if get_data_state(json_data): + print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break except: break diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index b32778d..5ad4fd7 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -34,7 +34,7 @@ def getStoryline(number, title): for i in range(cnt): sl = len(result[i])if isinstance(result[i], str) else 0 if sl and first: - s += f',[选中结果{apply_sites[i]}字数:{sl}]' + s += f',[选中{apply_sites[i]}字数:{sl}]' first = False sel = result[i] elif sl: diff --git a/config.py b/config.py index 5624e85..abe030e 100644 --- a/config.py +++ b/config.py @@ -394,10 +394,10 @@ if __name__ == "__main__": code = compile(evstr, "<string>", "eval") print('{}: "{}"'.format(evstr, eval(code))) config = Config() - mfilter = ('conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path') + mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'} for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: evprint(f'config.{_m}()') - pfilter = ('proxies', 'SUPPORT_PROXY_TYPE') + pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'} # test getInstance() assert(getInstance() == config) for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: From f5539279136ac1148f19e1e71fbde99d9158881d Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 17:58:21 +0800 Subject: [PATCH 41/56] =?UTF-8?q?=E6=8F=90=E9=80=9F=EF=BC=8C=E6=9A=82?= =?UTF-8?q?=E6=97=B6=E5=B1=8F=E8=94=BD=E6=9C=AA=E5=AE=9E=E7=8E=B0=E7=9A=84?= =?UTF-8?q?=E6=BC=94=E5=91=98=E7=85=A7=E7=89=87=E5=8A=9F=E8=83=BD=20javdb?= =?UTF-8?q?=20javbus?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 2 +- WebCrawler/javdb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index c9d53f3..0959e1e 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -182,7 +182,7 @@ def main(number): 'tag': getTag(htmlcode), 'extrafanart': getExtrafanart(htmlcode), 'label': getSerise(htmlcode), - 'actor_photo': getActorPhoto(htmlcode), +# 'actor_photo': getActorPhoto(htmlcode), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', 'series': getSerise(htmlcode), diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 241de49..34cfc32 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -319,7 +319,7 @@ def main(number): 'tag': getTag(detail_page), 'label': getLabel(detail_page), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(detail_page), +# 'actor_photo': getActorPhoto(detail_page), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', 'series': getSeries(detail_page), From 5ef16e3a6de5c3bb9381a6ceb9fe4b71619bcd81 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 18:09:36 +0800 Subject: [PATCH 42/56] =?UTF-8?q?=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=BF=90=E8=A1=8C=E6=A8=A1=E5=BC=8Frun=5Fmod?= 
=?UTF-8?q?e,=200:=E9=A1=BA=E5=BA=8F=E6=89=A7=E8=A1=8C=201:=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E6=B1=A0=202:=E8=BF=9B=E7=A8=8B=E6=B1=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/storyline.py | 23 +++++++++++++++++++---- config.ini | 3 +++ config.py | 7 +++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5ad4fd7..d9da869 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -2,13 +2,25 @@ import sys sys.path.append('../') import re import json +import builtins from ADC_function import * from multiprocessing import Pool +from multiprocessing.dummy import Pool as ThreadPool from difflib import SequenceMatcher from unicodedata import category G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"} +G_mode_txt = ('顺序执行','线程池','进程池') + +class noThread(object): + def map(self, fn, param): + return builtins.map(fn, param) + def __enter__(self): + return self + def __exit__(self, exc_type, exc_val, exc_tb): + pass + # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 def getStoryline(number, title): @@ -18,9 +30,12 @@ def getStoryline(number, title): storyine_sites = conf.storyline_site().split(',') apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) - # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/ - with Pool() as proc_pool: - result = proc_pool.map(getStoryline_mp, mp_args) + cores = min(len(apply_sites), os.cpu_count()) + run_mode = conf.storyline_mode() + assert run_mode in (0,1,2) + with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool: + result = pool.map(getStoryline_mp, mp_args) + result = list(result) if run_mode == 0 else result if not debug and conf.storyline_show() == 0: for value in result: if isinstance(value, str) and len(value): @@ -28,7 +43,7 @@ def getStoryline(number, title): return '' # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 cnt = len(apply_sites) - s = f'[!]MP Storyline 运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' + s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' first = True sel = '' for i in range(cnt): diff --git a/config.ini b/config.ini index 5125ad3..700fa95 100755 --- a/config.ini +++ b/config.ini @@ -92,5 +92,8 @@ extrafanart_folder=extrafanart ; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 ; site= site=airav,avno1,xcity,amazon +; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) +run_mode=1 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 show_result=0 + diff --git a/config.py b/config.py index abe030e..83a36bc 100644 --- a/config.py +++ b/config.py @@ -252,6 +252,12 @@ class Config: except: return 0 + def storyline_mode(self) -> int: + try: + v = self.conf.getint("storyline", "run_mode") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 1 @staticmethod def _exit(sec: str) -> None: @@ -350,6 +356,7 @@ class Config: conf.add_section(sec14) conf.set(sec14, "site", "airav,avno1,xcity,amazon") conf.set(sec14, "show_result", 0) + conf.set(sec14, "run_mode", 1) return conf From 4428971135149749bc74c286ae18a0eb551570bc Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 19:52:42 +0800 Subject: [PATCH 43/56] =?UTF-8?q?javdb.py:=20=E4=BC=98=E5=8C=96=EF=BC=8C?= 
=?UTF-8?q?=E4=BF=AE=E7=90=86getActorPhoto()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javdb.py | 103 +++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 34cfc32..e4e803c 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -9,13 +9,11 @@ from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): browser_title = str(html.xpath("/html/head/title/text()")[0]) return browser_title[:browser_title.find(' | JavDB')].strip() -def getActor(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getActor(html): actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') r = [] @@ -32,8 +30,8 @@ def getActor(a): idx = idx + 1 return r -def getaphoto(url): - html_page = get_html(url) +def getaphoto(url, browser): + html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url) img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_url = img_prether.findall(html_page) if img_url: @@ -41,24 +39,18 @@ def getaphoto(url): else: return '' -def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img - actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>') - actorall = actorall_prether.findall(html) - - if actorall: - actoralls = actorall[0] - actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>') - actor = actor_prether.findall(actoralls) - actor_photo = {} - for i in actor: - actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0]) - - return actor_photo - - else: +def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img + actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]') + if not actorall: return {} + a = getActor(html) + actor_photo = {} + for i in actorall: + if i.text in a: + actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser) + return actor_photo -def getStudio(a): +def getStudio(a, html): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") @@ -70,25 +62,21 @@ def getStudio(a): if len(result): return result # 以卖家作为工作室 - html = etree.fromstring(a, etree.HTMLParser()) try: result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") except: result = '' return result -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = 
str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') @@ -118,8 +106,7 @@ def getRelease(a): else: result = '' return result -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getTag(html): try: result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') total = [] @@ -140,11 +127,10 @@ def getTag(a): pass return total -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: @@ -175,23 +161,20 @@ def getTrailer(htmlcode): # 获取预告片 video_url = '' return video_url -def getExtrafanart(htmlcode): # 获取剧照 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getExtrafanart(html): # 获取剧照 result = [] try: result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") except: pass return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): try: result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] except: # 2020.7.17 Repair Cover Url crawl result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') @@ -206,9 +189,7 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) -def getSeries(a): - #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getSeries(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') @@ -243,6 +224,7 @@ def main(number): javdb_site = secrets.choice(javdb_sites) if debug: print(f'[!]javdb:select site {javdb_site}') + browser = None try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') @@ -277,52 +259,54 @@ def main(number): except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) + # etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用 + lx = etree.fromstring(detail_page, etree.HTMLParser()) # no cut image by default imagecut = 
3 # If gray image exists ,then replace with normal cover if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): - cover_small = getCover_small(query_result) + cover_small = getCover_small(html) else: try: - cover_small = getCover_small(query_result, index=ids.index(number)) + cover_small = getCover_small(html, index=ids.index(number)) except: # if input number is "STAR438" not "STAR-438", use first search result. - cover_small = getCover_small(query_result) + cover_small = getCover_small(html) if 'placeholder' in cover_small: # replace wit normal cover and cut it imagecut = 1 - cover_small = getCover(detail_page) + cover_small = getCover(lx) - dp_number = getNum(detail_page) + dp_number = getNum(lx) if dp_number.upper() != number: raise ValueError("number not found") - title = getTitle(detail_page) + title = getTitle(lx) if title and dp_number: number = dp_number # remove duplicate title title = title.replace(number, '').strip() dic = { - 'actor': getActor(detail_page), + 'actor': getActor(lx), 'title': title, - 'studio': getStudio(detail_page), + 'studio': getStudio(detail_page, lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), 'release': getRelease(detail_page), 'number': number, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': cover_small, 'trailer': getTrailer(detail_page), - 'extrafanart': getExtrafanart(detail_page), + 'extrafanart': getExtrafanart(lx), 'imagecut': imagecut, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), + 'tag': getTag(lx), + 'label': getLabel(lx), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), -# 'actor_photo': getActorPhoto(detail_page), +# 'actor_photo': getActorPhoto(lx, javdb_site, browser), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): @@ -356,4 +340,5 @@ if __name__ == "__main__": # print(main('EHM0001')) # not found # print(main('FC2-2314275')) # print(main('EBOD-646')) - print(main('LOVE-262')) + # print(main('LOVE-262')) + print(main('ABP-890')) From dd106453f76e19c40b31bf7559f607559c3f4bd0 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:03:51 +0800 Subject: [PATCH 44/56] =?UTF-8?q?=E5=AF=B9=E6=A0=87=E8=AE=B0=E4=B8=BA?= =?UTF-8?q?=E5=88=A0=E9=99=A4=E7=9A=84tag=E8=BF=9B=E8=A1=8C=E6=B8=85?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index b6e7b2f..44f9094 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -178,6 +178,10 @@ def get_data_from_json(file_number): # 从JSON返回元数据 imagecut = json_data.get('imagecut') tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ + while 'XXXX' in tag: + tag.remove('XXXX') + while 'xxx' in tag: + tag.remove('xxx') actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') if title == '' or number == '': @@ -306,4 +310,5 @@ def special_characters_replacement(text) -> str: replace('"', '"'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). 
# U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane - replace('|', 'ǀ')) # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('&', '&')) From d80b2eeb7d344b69a22fcec61380bcf30c7b8bd4 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:14:26 +0800 Subject: [PATCH 45/56] =?UTF-8?q?javbus.py:=20=E4=BC=98=E5=8C=96=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E7=90=86=E6=97=A0=E7=A0=81=E7=89=87=E7=9A=84=E5=AF=BC?= =?UTF-8?q?=E6=BC=94=E3=80=81=E7=B3=BB=E5=88=97=E7=AD=89=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 187 ++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 107 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 0959e1e..f17a1ab 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -3,81 +3,61 @@ sys.path.append('../') import re from pyquery import PyQuery as pq#need install from lxml import etree#need install -from bs4 import BeautifulSoup#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline import inspect -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) +def getActorPhoto(doc): #//*[@id="star_qdt"]/li/a/img + actors = doc('div.star-name a').items() d={} - for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) + for i in actors: + url=i.attr.href + t=i.attr.title + html = etree.fromstring(get_html(url), etree.HTMLParser()) p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) p2={t:p} d.update(p2) return d -def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - title=str(doc('div.container h3').text()).replace(' ','-') - try: - title2 = re.sub('n\d+-','',title) - return title2 - except: - return title -def getStudio(htmlcode): #获取厂商 已修改 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - # 如果记录中冇导演,厂商排在第4位 - if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - # 如果记录中有导演,厂商排在第5位 - elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) +def getTitle(html): #获取标题 + title = str(html.xpath('/html/head/title/text()')[0]) + title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip() + return title +def getStudioJa(html): + x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getStudio(html): #获取厂商 + x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getYear(html): #获取年份 + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() + return result[:4] if len(result)>=len('2000-01-01') else '' +def getCover(doc): #获取封面链接 image = doc('a.bigImage') return 
urljoin("https://www.javbus.com", image.attr('href')) -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result -def getRuntime(htmlcode): #获取分钟 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result -def getActor(htmlcode): #获取女优 +def getActor(doc): #获取女优 b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) + actors = doc('div.star-name a').items() + for i in actors: + b.append(i.attr.title) return b -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - else: - result = '' # 记录中有可能没有导演数据 - return result -def getCID(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - #print(htmlcode) +def getNum(html): #获取番号 + kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return kwdlist[0] +def getDirectorJa(html): + x = html.xpath('//span[contains(text(),"監督:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getDirector(html): #获取导演 + x = html.xpath('//span[contains(text(),"導演:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getCID(html): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result @@ -94,27 +74,16 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) -def getSerise(htmlcode): #获取系列 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - # 如果记录中冇导演,系列排在第6位 - if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") - # 如果记录中有导演,系列排在第7位 - elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getTag(htmlcode): # 获取标签 - tag = [] - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - for i in a: - if 'onmouseout' in str(i) or '多選提交' in str(i): - continue - tag.append(translateTag_to_sc(i.get_text())) - return tag - +def getSeriseJa(html): + x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getSerise(html): #获取系列 + x = html.xpath('//span[contains(text(),"系列:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getTag(html): # 获取标签 + klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + taglist = [translateTag_to_sc(v) for v in klist[1:]] + return taglist def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>') html = 
html_pather.search(htmlcode) @@ -128,30 +97,30 @@ def getExtrafanart(htmlcode): # 获取剧照 def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number) - if getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','') + doc = pq(htmlcode) + lx = etree.fromstring(htmlcode, etree.HTMLParser()) + title = getTitle(lx) dic = { 'title': title, - 'studio': getStudio(htmlcode), - 'year': getYear(htmlcode), + 'studio': getStudioJa(lx), + 'year': getYear(lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'tag': getTag(htmlcode), + 'runtime': getRuntime(lx), + 'director': getDirectorJa(lx), + 'actor': getActor(doc), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(doc), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), + 'label': getSeriseJa(lx), 'imagecut': 0, - 'actor_photo': '', +# 'actor_photo': '', 'website': 'https://www.javbus.com/ja/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSeriseJa(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js @@ -166,26 +135,28 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode))) + doc = pq(htmlcode) + lx = etree.fromstring(htmlcode,etree.HTMLParser()) + title = getTitle(lx) dic = { 'title': title, - 'studio': getStudio(htmlcode), - 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), + 'studio': getStudio(lx), + 'year': getYear(lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'actor': getActor(doc), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(doc), 'imagecut': 1, - 'tag': getTag(htmlcode), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), -# 'actor_photo': getActorPhoto(htmlcode), + 'label': getSerise(lx), +# 'actor_photo': getActorPhoto(doc), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSerise(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') return js @@ -206,8 +177,10 @@ if __name__ == "__main__" : config.G_conf_override['debug_mode:switch'] = True print(main('ABP-888')) print(main('ABP-960')) - # print(main('ADV-R0624')) # 404 - # print(main('MMNT-010')) + print(main('ADV-R0624')) # 404 + print(main('MMNT-010')) print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) + print(main('100221_001')) + print(main('AVSW-061')) From 5da134986a674ac5d498207eacd1063cf69f4544 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:17:45 +0800 Subject: [PATCH 46/56] storyline.py: bug fix --- 
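This fixes a crash when every configured storyline site has been filtered
out: cores = min(len(apply_sites), os.cpu_count()) is then 0, and
multiprocessing rejects Pool(0)/ThreadPool(0) with ValueError. A minimal
sketch of the failure mode and of the early-return guard this patch adds
(fetch_all and the site names are illustrative, not part of the patch):

    import os
    from multiprocessing.dummy import Pool as ThreadPool

    def fetch_all(sites, worker):
        cores = min(len(sites), os.cpu_count())
        if cores == 0:                  # the guard added below
            return []
        with ThreadPool(cores) as pool:
            return pool.map(worker, sites)

    print(fetch_all(['airav', 'avno1'], str.upper))  # ['AIRAV', 'AVNO1']
    print(fetch_all([], str.upper))                  # [] instead of ValueError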
WebCrawler/storyline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index d9da869..693f404 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -31,6 +31,8 @@ def getStoryline(number, title): apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count()) + if cores == 0: + return '' run_mode = conf.storyline_mode() assert run_mode in (0,1,2) with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool: From 249884a27e6d7496bfb0944b8421f9c1c2c71e31 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:58:28 +0800 Subject: [PATCH 47/56] =?UTF-8?q?javbus.py:=20=E4=BC=98=E5=8C=96=E6=8F=90?= =?UTF-8?q?=E9=80=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index f17a1ab..63457bf 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -1,19 +1,18 @@ import sys sys.path.append('../') import re -from pyquery import PyQuery as pq#need install from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline import inspect -def getActorPhoto(doc): #//*[@id="star_qdt"]/li/a/img - actors = doc('div.star-name a').items() +def getActorPhoto(html): + actors = html.xpath('//div[@class="star-name"]/a') d={} for i in actors: - url=i.attr.href - t=i.attr.title + url=i.attrib['href'] + t=i.attrib['title'] html = etree.fromstring(get_html(url), etree.HTMLParser()) p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) @@ -33,20 +32,20 @@ def getStudio(html): #获取厂商 def getYear(html): #获取年份 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() return result[:4] if len(result)>=len('2000-01-01') else '' -def getCover(doc): #获取封面链接 - image = doc('a.bigImage') - return urljoin("https://www.javbus.com", image.attr('href')) +def getCover(html): #获取封面链接 + image = str(html.xpath('//a[@class="bigImage"]/@href')[0]) + return urljoin("https://www.javbus.com", image) def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result -def getActor(doc): #获取女优 +def getActor(html): #获取女优 b=[] - actors = doc('div.star-name a').items() + actors = html.xpath('//div[@class="star-name"]/a') for i in actors: - b.append(i.attr.title) + b.append(i.attrib['title']) return b def getNum(html): #获取番号 kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') @@ -99,7 +98,6 @@ def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - doc = pq(htmlcode) lx = etree.fromstring(htmlcode, etree.HTMLParser()) title = getTitle(lx) dic = { @@ -109,10 +107,10 @@ def main_uncensored(number): 'outline': getOutline(number, title), 'runtime': getRuntime(lx), 'director': getDirectorJa(lx), - 'actor': getActor(doc), + 'actor': getActor(lx), 'release': getRelease(lx), 
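        # fields here read from the lx tree parsed once in main(); only
        # getExtrafanart() still re-scans the raw htmlcode, and getOutline()
        # queries the storyline sites by number/title instead of this page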
'number': getNum(lx), - 'cover': getCover(doc), + 'cover': getCover(lx), 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), 'label': getSeriseJa(lx), @@ -135,7 +133,6 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - doc = pq(htmlcode) lx = etree.fromstring(htmlcode,etree.HTMLParser()) title = getTitle(lx) dic = { @@ -145,15 +142,15 @@ def main(number): 'outline': getOutline(number, title), 'runtime': getRuntime(lx), 'director': getDirector(lx), - 'actor': getActor(doc), + 'actor': getActor(lx), 'release': getRelease(lx), 'number': getNum(lx), - 'cover': getCover(doc), + 'cover': getCover(lx), 'imagecut': 1, 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), 'label': getSerise(lx), -# 'actor_photo': getActorPhoto(doc), +# 'actor_photo': getActorPhoto(lx), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', 'series': getSerise(lx), From aae4df73fae2dd0d8788dbd2e8f491fd90d9447c Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 01:00:50 +0800 Subject: [PATCH 48/56] =?UTF-8?q?javbus.py:=20=E6=B8=85=E7=90=86=E8=BF=87?= =?UTF-8?q?=E6=9C=9F=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 63457bf..7866052 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -5,7 +5,6 @@ from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline -import inspect def getActorPhoto(html): actors = html.xpath('//div[@class="star-name"]/a') @@ -60,17 +59,6 @@ def getCID(html): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 - if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): - return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 - try: - htmlcode = get_html('https://cn.airav.wiki/video/' + number) - from WebCrawler.airav import getOutline as airav_getOutline - result = airav_getOutline(htmlcode) - return result - except: - pass - return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) def getSeriseJa(html): From daf7f5e0a0efdc9aada4f0af784a7d90560bcfc0 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 15:14:15 +0800 Subject: [PATCH 49/56] =?UTF-8?q?carib.py:=20=E5=B0=9D=E8=AF=95=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=B8=AD=E6=96=87=E5=89=A7=E6=83=85=E4=BB=8B=E7=BB=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 2 ++ WebCrawler/carib.py | 38 +++++++++++++++++++++++++------------- WebCrawler/storyline.py | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index 44f9094..039fed0 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str: replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane replace('|', 'ǀ'). 
# U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK + replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK replace('&', '&')) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index c1a25d9..3e583df 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -4,26 +4,29 @@ import json from lxml import html import re from ADC_function import * +from WebCrawler.storyline import getStoryline def main(number: str) -> json: try: - carib_obj, browser = get_html_by_browser( - 'https://www.caribbeancom.com/moviepages/'+number+'/index.html', - return_type="browser") - - if not carib_obj or not carib_obj.ok: + # 因演员图片功能还未使用,为提速暂时注释,改为用get_html() + #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html', + # return_type='browser') + #if not r.ok: + # raise ValueError("page not found") + #htmlcode = str(browser.page) + htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content') + htmlcode = htmlbyte.decode('euc-jp') + if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode: raise ValueError("page not found") - lx = html.fromstring(str(browser.page)) - - if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): - raise ValueError("page info not found") + lx = html.fromstring(htmlcode) + title = get_title(lx) dic = { - 'title': get_title(lx), + 'title': title, 'studio': '加勒比', 'year': get_year(lx), - 'outline': get_outline(lx), + 'outline': get_outline(lx, number, title), 'runtime': get_runtime(lx), 'director': '', 'actor': get_actor(lx), @@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] -def get_outline(lx: html.HtmlElement) -> str: - return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() +def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: + o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() + + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g + return o def get_release(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 693f404..5c2b91a 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -23,11 +23,11 @@ class noThread(object): # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 -def getStoryline(number, title): +def getStoryline(number, title, sites: list=None): start_time = time.time() conf = config.getInstance() debug = conf.debug() or conf.storyline_show() == 2 - storyine_sites = conf.storyline_site().split(',') + storyine_sites = conf.storyline_site().split(',') if sites is None else sites apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count()) From 8559eea29652db4dcf79cd0f741e0b2de7d009bb Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 15:18:39 +0800 Subject: [PATCH 50/56] =?UTF-8?q?avsox.py:=20=E5=85=83=E6=95=B0=E6=8D=AE?= 
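carib.py's get_outline() above narrows the storyline lookup to sites that can
return Chinese text before falling back to the Japanese description scraped
from the page itself. The order-preserving intersection it relies on, as a
standalone sketch (the configured list here is illustrative):

    configured = ['airav', 'xcity', 'amazon', 'avno1']  # priority order, as in config.ini
    a = set(configured) & {'airav', 'avno1'}            # which Chinese-capable sites are enabled
    site = [n for n in configured if n in a]            # re-apply the configured priority
    assert site == ['airav', 'avno1']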
=?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=89=A7=E6=83=85=E4=BB=8B=E7=BB=8D=E3=80=82?= =?UTF-8?q?=E4=BC=98=E5=8C=96:=E5=87=8F=E5=B0=91etree.fromstring=E9=AB=98?= =?UTF-8?q?=E5=BC=80=E9=94=80=E8=B0=83=E7=94=A8=E6=AC=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/avsox.py | 53 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 293769a..a353690 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -5,12 +5,11 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -# import sys +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') +def getActorPhoto(soup): a = soup.find_all(attrs={'class': 'avatar-box'}) d = {} for i in a: @@ -19,34 +18,28 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img p2 = {t: l} d.update(p2) return d -def getTitle(a): +def getTitle(html): try: - html = etree.fromstring(a, etree.HTMLParser()) result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0] return result.replace('/', '') except: return '' -def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - soup = BeautifulSoup(a, 'lxml') +def getActor(soup): a = soup.find_all(attrs={'class': 'avatar-box'}) d = [] for i in a: d.append(i.span.get_text()) return d -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getStudio(html): result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') return result1 -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']") return result1 -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']") return result1 -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getNum(html): result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']") return result1 def getYear(release): @@ -55,28 +48,23 @@ def getYear(release): return result except: return release -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRelease(html): result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']") return result1 -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']") return result -def getCover_small(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result -def getTag(a): # 获取演员 - soup = BeautifulSoup(a, 'lxml') +def getTag(soup): # 获取演员 a = soup.find_all(attrs={'class': 'genre'}) d = [] for i in a: d.append(i.get_text()) return d -def getSeries(htmlcode): +def getSeries(html): try: - html = 
etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") return result1 except: @@ -98,27 +86,30 @@ def main(number): result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") web = get_html("https:" + result1) soup = BeautifulSoup(web, 'lxml') + web = etree.fromstring(web, etree.HTMLParser()) info = str(soup.find(attrs={'class': 'row movie'})) + info = etree.fromstring(info, etree.HTMLParser()) try: new_number = getNum(info) if new_number.upper() != number.upper(): raise ValueError('number not found') + title = getTitle(web).strip(getNum(web)) dic = { - 'actor': getActor(web), - 'title': getTitle(web).strip(getNum(web)), + 'actor': getActor(soup), + 'title': title, 'studio': getStudio(info), - 'outline': '', # + 'outline': getStoryline(number, title), 'runtime': getRuntime(info), 'director': '', # 'release': getRelease(info), 'number': new_number, 'cover': getCover(web), - 'cover_small': getCover_small(a), + 'cover_small': getCover_small(html), 'imagecut': 3, - 'tag': getTag(web), + 'tag': getTag(soup), 'label': getLabel(info), 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(web), + 'actor_photo': getActorPhoto(soup), 'website': "https:" + result1, 'source': 'avsox.py', 'series': getSeries(info), From c3e9ab795735df91eb23d4d28f5f66e21d2bd079 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 17:08:00 +0800 Subject: [PATCH 51/56] =?UTF-8?q?avsox.py:=20=E4=BC=98=E5=8C=96:=E5=AE=8C?= =?UTF-8?q?=E6=88=90=E7=B2=BE=E7=AE=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/avsox.py | 61 ++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index a353690..e38a452 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -3,18 +3,17 @@ sys.path.append('..') import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getActorPhoto(soup): - a = soup.find_all(attrs={'class': 'avatar-box'}) +def getActorPhoto(html): + a = html.xpath('//a[@class="avatar-box"]') d = {} for i in a: - l = i.img['src'] - t = i.span.get_text() + l = i.find('.//img').attrib['src'] + t = i.find('span').text p2 = {t: l} d.update(p2) return d @@ -24,11 +23,11 @@ def getTitle(html): return result.replace('/', '') except: return '' -def getActor(soup): - a = soup.find_all(attrs={'class': 'avatar-box'}) +def getActor(html): + a = html.xpath('//a[@class="avatar-box"]') d = [] for i in a: - d.append(i.span.get_text()) + d.append(i.find('span').text) return d def getStudio(html): result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') @@ -57,12 +56,9 @@ def getCover(html): def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result -def getTag(soup): # 获取演员 - a = soup.find_all(attrs={'class': 'genre'}) - d = [] - for i in a: - d.append(i.get_text()) - return d +def getTag(html): + x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return x[2:] if len(x) > 2 else [] def getSeries(html): 
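    # The parse-once pattern this refactor applies across the crawlers,
    # sketched standalone (detail stands in for the fetched page source):
    #     lx = etree.fromstring(detail, etree.HTMLParser())  # expensive: do once
    #     series = getSeries(lx)                             # cheap xpath() reuse
    #     label = getLabel(lx)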
try: result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") @@ -74,45 +70,42 @@ def main(number): html = get_html('https://tellme.pw/avsox') site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0] a = get_html(site + '/cn/search/' + number) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': a = get_html(site + '/cn/search/' + number.replace('-', '_')) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': a = get_html(site + '/cn/search/' + number.replace('_', '')) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - web = get_html("https:" + result1) - soup = BeautifulSoup(web, 'lxml') - web = etree.fromstring(web, etree.HTMLParser()) - info = str(soup.find(attrs={'class': 'row movie'})) - info = etree.fromstring(info, etree.HTMLParser()) + detail = get_html("https:" + result1) + lx = etree.fromstring(detail, etree.HTMLParser()) try: - new_number = getNum(info) + new_number = getNum(lx) if new_number.upper() != number.upper(): raise ValueError('number not found') - title = getTitle(web).strip(getNum(web)) + title = getTitle(lx).strip(new_number) dic = { - 'actor': getActor(soup), + 'actor': getActor(lx), 'title': title, - 'studio': getStudio(info), + 'studio': getStudio(lx), 'outline': getStoryline(number, title), - 'runtime': getRuntime(info), + 'runtime': getRuntime(lx), 'director': '', # - 'release': getRelease(info), + 'release': getRelease(lx), 'number': new_number, - 'cover': getCover(web), + 'cover': getCover(lx), 'cover_small': getCover_small(html), 'imagecut': 3, - 'tag': getTag(soup), - 'label': getLabel(info), - 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(soup), + 'tag': getTag(lx), + 'label': getLabel(lx), + 'year': getYear(getRelease(lx)), + 'actor_photo': getActorPhoto(lx), 'website': "https:" + result1, 'source': 'avsox.py', - 'series': getSeries(info), + 'series': getSeries(lx), } except Exception as e: if config.getInstance().debug(): From b025c5185270f44672ef43c9aad149459c72aeab Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 18:40:57 +0800 Subject: [PATCH 52/56] =?UTF-8?q?xcity.py:=E5=B0=9D=E8=AF=95=E8=8E=B7?= =?UTF-8?q?=E5=BE=97=E4=B8=AD=E6=96=87=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B?= =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E6=9C=89=E5=88=99=E7=94=A8=E5=8E=9F=E6=9D=A5?= =?UTF-8?q?=E7=9A=84=E3=80=82=E4=BF=AE=E5=A4=8Dtag=E6=95=B0=E7=9B=AE?= =?UTF-8?q?=E4=B8=8D=E5=AF=B9=EF=BC=8C=E4=BF=AE=E5=A4=8Druntime=E4=B8=8D?= =?UTF-8?q?=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/xcity.py | 99 ++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index 4bbdec1..6eb208d 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -3,16 +3,12 @@ sys.path.append('../') import re from lxml 
import etree import json -from bs4 import BeautifulSoup from ADC_function import * - - -# import sys +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): result = html.xpath('//*[@id="program_detail_title"]/text()')[0] return result @@ -43,8 +39,7 @@ def getActorPhoto(browser): return o -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getStudio(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") except: @@ -52,20 +47,14 @@ def getStudio(a): return result.strip('+').replace("', '", '').replace('"', '') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): try: - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] - except: - return '' - try: - return re.findall('\d+',result1)[0] + x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip() + return x except: return '' - -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] return result @@ -73,8 +62,7 @@ def getLabel(a): return '' -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): try: result = html.xpath('//*[@id="hinban"]/text()')[0] return result @@ -90,8 +78,7 @@ def getYear(getRelease): return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRelease(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) except: @@ -102,31 +89,22 @@ def getRelease(a): return '' -def getTag(a): - result2=[] - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()') - for i in result1: - i=i.replace(u'\n','') - i=i.replace(u'\t','') - if len(i): - result2.append(i) - return result2 +def getTag(html): + x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()') + return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else [] -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: result = 'https:' + result return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] return 'https:' + result @@ -134,8 +112,7 @@ def getCover(htmlcode): return '' -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): try: result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') return result @@ -143,19 +120,21 @@ def getDirector(a): return '' -def 
getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getOutline(html, number, title): + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0] + x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0] + return x.replace(getNum(html), '') except: return '' - try: - return re.sub('\\\\\w*\d+','',result) - except: - return result -def getSeries(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getSeries(html): try: try: result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] @@ -198,33 +177,35 @@ def main(number): try: detail_page, browser = open_by_browser(number) url = browser.url - newnum = getNum(detail_page).upper() + lx = etree.fromstring(detail_page, etree.HTMLParser()) + newnum = getNum(lx).upper() number_up = number.upper() if newnum != number_up: if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found") + title = getTitle(lx) dic = { 'actor': getActor(browser), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), + 'title': title, + 'studio': getStudio(lx), + 'outline': getOutline(lx, number, title), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'release': getRelease(lx), 'number': newnum, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': '', 'extrafanart': getExtrafanart(detail_page), 'imagecut': 1, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), + 'tag': getTag(lx), + 'label': getLabel(lx), + 'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()), # 'actor_photo': getActorPhoto(browser), 'website': url, 'source': 'xcity.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } except Exception as e: if config.getInstance().debug(): From cb83e4246db7ea503ba9f3ef9bccec2274e14ac7 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Wed, 20 Oct 2021 03:34:44 +0800 Subject: [PATCH 53/56] =?UTF-8?q?=E6=97=A0=E7=A0=81=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=E7=A7=BB=E5=85=A5number=5Fparser.py=E5=B9=B6=E6=89=A9=E5=85=85?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E8=83=BD=E5=8A=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 11 ----------- config.ini | 3 +-- core.py | 2 +- number_parser.py | 34 +++++++++++++++++++++++++++++++++- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 5b1d507..12fecce 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -566,17 +566,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t return trans_result -# ========================================================================是否为无码 -def is_uncensored(number): - if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper(): - return True - configs = config.getInstance().get_uncensored() - prefix_list = str(configs).split(',') - for pre in prefix_list: - if 
pre.upper() in number.upper(): - return True - return False - # 从浏览器中导出网站登录验证信息的cookies,能够以会员方式打开游客无法访问到的页面 # 示例: FC2-755670 url https://javdb9.com/v/vO8Mn # json 文件格式 diff --git a/config.ini b/config.ini index 700fa95..b4d9fb4 100755 --- a/config.ini +++ b/config.ini @@ -65,8 +65,7 @@ switch=0 ; 用来确定是否是无码 [uncensored] -uncensored_prefix=S2M,BT,LAF,SMD - +uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED [media] ; 影片后缀 diff --git a/core.py b/core.py index ae73af8..d7066f4 100755 --- a/core.py +++ b/core.py @@ -12,7 +12,7 @@ from datetime import datetime from ADC_function import * from WebCrawler import get_data_from_json - +from number_parser import is_uncensored def escape_path(path, escape_literals: str): # Remove escape literals backslash = '\\' diff --git a/number_parser.py b/number_parser.py index 616af85..212c2c0 100755 --- a/number_parser.py +++ b/number_parser.py @@ -1,6 +1,7 @@ import os import re import sys +import config G_spat = re.compile( "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@", @@ -82,6 +83,37 @@ def get_number_by_dict(filename: str) -> str: pass return None +class Cache_uncensored_conf: + prefix = None + def is_empty(self): + return bool(self.prefix is None) + def set(self, v: list): + if not v or not len(v) or not len(v[0]): + raise ValueError('input prefix list empty or None') + s = v[0] + if len(v) > 1: + for i in v[1:]: + s += f"|{i}.+" + self.prefix = re.compile(s, re.I) + def check(self, number): + if self.prefix is None: + raise ValueError('No init re compile') + return self.prefix.match(number) + +G_cache_uncensored_conf = Cache_uncensored_conf() + +# ========================================================================是否为无码 +def is_uncensored(number): + if re.match( +r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}', + number, + re.I + ): + return True + if G_cache_uncensored_conf.is_empty(): + G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(',')) + return G_cache_uncensored_conf.check(number) + if __name__ == "__main__": # import doctest # doctest.testmod(raise_on_error=True) @@ -164,7 +196,7 @@ if __name__ == "__main__": try: n = get_number(True, filename) if n: - print(f' [{n}] # {filename}') + print(' [{0}] {2}# {1}'.format(n, filename, '#无码' if is_uncensored(n) else '')) else: print(f'[-]Number return None. 
# {filename}') except Exception as e: From c44031548809cb1a53cc5e10c81adcd5c500a3e3 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Wed, 20 Oct 2021 23:07:37 +0800 Subject: [PATCH 54/56] =?UTF-8?q?=E7=BF=BB=E8=AF=91=E5=89=8D=E6=A3=80?= =?UTF-8?q?=E6=9F=A5=E8=AF=AD=E8=A8=80=EF=BC=8C=E5=B7=B2=E7=BB=8F=E6=98=AF?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=BA=86=E4=B8=8D=E5=BF=85=E7=BF=BB=E8=AF=91?= =?UTF-8?q?=EF=BC=8C=E5=8F=AA=E7=BF=BB=E8=AF=91=E6=97=A5=E8=AF=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ADC_function.py b/ADC_function.py index 12fecce..36be657 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -511,6 +511,9 @@ def translate( delay: int = 0, ): trans_result = "" + # 中文句子如果包含&等符号会被谷歌翻译截断损失内容,而且中文翻译到中文也没有意义,故而忽略,只翻译带有日语假名的 + if not is_japanese(src): + return src if engine == "google-free": gsite = config.getInstance().get_translate_service_site() if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite): @@ -620,3 +623,7 @@ def file_modification_days(filename) -> int: def file_not_exist_or_empty(filepath) -> bool: return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0 + +# 日语简单检测 +def is_japanese(s) -> bool: + return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE)) From 1f9bf6b4c283c804252d2ebd8e0c9fc123d5dde5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Thu, 21 Oct 2021 19:57:09 +0800 Subject: [PATCH 55/56] =?UTF-8?q?=E6=97=A5=E5=BF=97=E5=90=88=E5=B9=B6:?= =?UTF-8?q?=E4=B8=89=E5=A4=A9=E4=B9=8B=E5=89=8D=E7=9A=84=E6=97=A5=E5=BF=97?= =?UTF-8?q?=EF=BC=8C=E5=90=88=E5=B9=B6=E4=B8=BA=E5=8D=95=E6=97=A5=E5=8D=95?= =?UTF-8?q?=E4=B8=AA=E6=96=87=E4=BB=B6=EF=BC=8C=E4=BB=A5=E8=A7=A3=E5=86=B3?= =?UTF-8?q?=E5=A2=9E=E9=87=8F=E5=A4=84=E7=90=86=E6=97=B6=E5=B0=8F=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=BF=87=E5=A4=9A=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 6c13e5d..e87be03 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -162,14 +162,15 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 合并日志 只检测日志目录内的文本日志,忽略子目录。三个月前的日志,按月合并为一个月志, - # 去年及以前的月志,今年4月以后将之按年合并为年志 + # 合并日志 只检测日志目录内的文本日志,忽略子目录。三天前的日志,按日合并为单个日志,三个月前的日志, + # 按月合并为单个月志,去年及以前的月志,今年4月以后将之按年合并为年志 # 测试步骤: """ LOGDIR=/tmp/avlog mkdir -p $LOGDIR for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done + for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" # 1932 files in /tmp/avlog avdc -zgic1 -d0 -m3 -o $LOGDIR @@ -177,19 +178,40 @@ def close_logfile(logdir: str): ls $LOGDIR # rm -rf $LOGDIR """ - # 第一步,合并到月 today = datetime.today() + # 第一步,合并到日。3天前的日志,文件名是同一天的合并为一份日志 + for i in range(1): + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)] + if not txts or not len(txts): + break + e = [f for f in txts if '_err' in f.stem] + txts.sort() + tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99") + deadline_day = f'avdc_{tmstr_3_days_ago}' + day_merge = [f for f in txts if f.stem < deadline_day] + if not 
day_merge or not len(day_merge): + break + cutday = len('T235959.txt') # cut length avdc_20201201|T235959.txt + for f in day_merge: + try: + day_file_name = str(f)[:-cutday] + '.txt' # avdc_20201201.txt + with open(day_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第二步,合并到月 for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 - txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)] if not txts or not len(txts): break txts.sort() - tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") + tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32") deadline_month = f'avdc_{tmstr_3_month_ago}' month_merge = [f for f in txts if f.stem < deadline_month] if not month_merge or not len(month_merge): break - tomonth = len('01T235959.txt') # cut length avdc_202012|01T235959.txt + tomonth = len('01.txt') # cut length avdc_202012|01.txt for f in month_merge: try: month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt @@ -198,10 +220,10 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 第二步,月合并到年 + # 第三步,月合并到年 if today.month < 4: return - mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{6}', f.stem, re.A)] + mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)] if not mons or not len(mons): return mons.sort() @@ -218,7 +240,7 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 第三步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 + # 第四步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 # 这种粒度的文本日志,压缩比是目前最好的。lzip -9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, # 多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, # 100MB的日志文件能缩小到3.7MB。 From 850679705ee6eacaedeb37b2c2405fcf315b947a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Thu, 21 Oct 2021 20:02:07 +0800 Subject: [PATCH 56/56] =?UTF-8?q?=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B:?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=97=A0=E7=A0=81=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=AB=99=E7=82=B9=EF=BC=8C=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E9=80=9A=E7=94=A8=E3=80=81=E6=9C=89=E7=A0=81?= =?UTF-8?q?=E3=80=81=E6=97=A0=E7=A0=81=E4=B8=89=E7=A7=8D=E7=AB=99=E7=82=B9?= =?UTF-8?q?=E5=88=86=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/carib.py | 11 ++----- WebCrawler/storyline.py | 71 +++++++++++++++++++++++++++++++++++------ WebCrawler/xcity.py | 2 +- config.ini | 13 +++++--- config.py | 18 +++++++++-- core.py | 5 +-- number_parser.py | 2 +- 7 files changed, 92 insertions(+), 30 deletions(-) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index 3e583df..790b910 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -60,14 +60,9 @@ def get_year(lx: html.HtmlElement) -> str: def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() - - storyline_site = config.getInstance().storyline_site().split(',') - a = set(storyline_site) & {'airav', 'avno1'} - if len(a): - site = [n for n in storyline_site if n in a] - g = getStoryline(number, title, site) - if len(g): - return g + g = getStoryline(number, title) + if len(g): + return g return o def get_release(lx: 
From 850679705ee6eacaedeb37b2c2405fcf315b947a Mon Sep 17 00:00:00 2001
From: lededev <lededev@noreplay.github.com>
Date: Thu, 21 Oct 2021 20:02:07 +0800
Subject: [PATCH 56/56] Storyline: add an uncensored metadata site; the config
 file now lists general, censored and uncensored sites separately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebCrawler/carib.py     | 11 ++-----
 WebCrawler/storyline.py | 71 +++++++++++++++++++++++++++++++++++------
 WebCrawler/xcity.py     |  2 +-
 config.ini              | 13 +++++---
 config.py               | 18 +++++++++--
 core.py                 |  5 +--
 number_parser.py        |  2 +-
 7 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index 3e583df..790b910 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -60,14 +60,9 @@ def get_year(lx: html.HtmlElement) -> str:
 
 def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
     o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
-
-    storyline_site = config.getInstance().storyline_site().split(',')
-    a = set(storyline_site) & {'airav', 'avno1'}
-    if len(a):
-        site = [n for n in storyline_site if n in a]
-        g = getStoryline(number, title, site)
-        if len(g):
-            return g
+    g = getStoryline(number, title)
+    if len(g):
+        return g
     return o
 
 def get_release(lx: html.HtmlElement) -> str:
diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py
index 5c2b91a..9b0a44c 100644
--- a/WebCrawler/storyline.py
+++ b/WebCrawler/storyline.py
@@ -8,8 +8,9 @@ from multiprocessing import Pool
 from multiprocessing.dummy import Pool as ThreadPool
 from difflib import SequenceMatcher
 from unicodedata import category
+from number_parser import is_uncensored
 
-G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"}
+G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
 
 G_mode_txt = ('顺序执行','线程池','进程池')
 
@@ -28,7 +29,16 @@ def getStoryline(number, title, sites: list=None):
     conf = config.getInstance()
     debug = conf.debug() or conf.storyline_show() == 2
     storyine_sites = conf.storyline_site().split(',') if sites is None else sites
-    apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
+    if is_uncensored(number):
+        storyine_sites += conf.storyline_uncensored_site().split(',')
+    else:
+        storyine_sites += conf.storyline_censored_site().split(',')
+    r_dup = set()
+    apply_sites = []
+    for s in storyine_sites:
+        if s in G_registered_storyline_site and s not in r_dup:
+            apply_sites.append(s)
+            r_dup.add(s)
     mp_args = ((site, number, title, debug) for site in apply_sites)
     cores = min(len(apply_sites), os.cpu_count())
     if cores == 0:
@@ -80,6 +90,8 @@ def _getStoryline_mp(site, number, title, debug):
         storyline = getStoryline_xcity(number, debug)
     elif site == "amazon":
         storyline = getStoryline_amazon(title, number, debug)
+    elif site == "58avgo":
+        storyline = getStoryline_58avgo(number, debug)
     if not debug:
         return storyline
     print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
@@ -119,24 +131,63 @@ def getStoryline_airav(number, debug):
     return None
 
 
+def getStoryline_58avgo(number, debug):
+    try:
+        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
+            '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
+            '?status=1&Sort=Playon', '?status=1&Sort=dateupload', '?status=1&Sort=dateproduce'
+        ])  # pick one at random so a single IP's requests do not look too uniform in the site's httpd logs
+        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+        result, browser = get_html_by_form(url,
+                          fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+                          return_type = 'browser')
+        if not result.ok:
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        if f'searchresults.aspx?Search={kwd}' not in browser.url:
+            raise ValueError("number not found")
+        s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
+        link = None
+        for i in range(len(s)):
+            title = s[i].h3.text.strip()
+            if re.search(number, title, re.I):
+                link = s[i]
+                break
+        if link is None:
+            raise ValueError("number not found")
+        result = browser.follow_link(link)
+        if not result.ok or 'playon.aspx' not in browser.url:
+            raise ValueError("detail page not found")
+        title = browser.page.select('head > title')[0].text.strip()
+        detail_number = str(re.findall(r'\[(.*?)]', title)[0])
+        if not re.search(number, detail_number, re.I):
+            raise ValueError(f"detail page number does not match, got ->[{detail_number}]")
+        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
 def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), '?top=1&cat=hd',
             '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
         ])  # pick one at random so a single IP's requests do not look too uniform in the site's httpd logs
-        number_up = number.upper()
         result, browser = get_html_by_form(url,
                           form_select='div.wrapper > div.header > div.search > form',
-                          fields = {'kw' : number_up},
+                          fields = {'kw' : number},
                           return_type = 'browser')
         if not result.ok:
-            raise ValueError(f"get_html_by_form('{url}','{number_up}') failed")
-        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
-        page_number = title[title.rfind(' '):].upper()
-        if not number_up in page_number:
-            raise ValueError(f"page number ->[{page_number}] not match")
-        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        s = browser.page.select('div.type_movie > div > ul > li > div')
+        for i in range(len(s)):
+            title = s[i].a.h3.text.strip()
+            page_number = title[title.rfind(' '):].strip()
+            if re.search(number, page_number, re.I):
+                return s[i]['data-description'].strip()
+        raise ValueError(f"no search result matched number [{number}]")
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
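Taken together, getStoryline() now builds its site list in two stages: the general list from site= is extended by either the uncensored or the censored list, then de-duplicated while preserving left-to-right priority. A sketch of that selection logic, with the new default lists assumed from the config.ini hunk below:

    G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}

    def apply_sites_for(uncensored: bool) -> list:
        sites = ['avno1']                                           # [storyline] site=
        sites += ['58avgo'] if uncensored else ['airav', 'xcity', 'amazon']
        seen, result = set(), []
        for s in sites:                                             # order-preserving de-dup
            if s in G_registered_storyline_site and s not in seen:
                result.append(s)
                seen.add(s)
        return result

    print(apply_sites_for(False))  # ['avno1', 'airav', 'xcity', 'amazon']
    print(apply_sites_for(True))   # ['avno1', '58avgo']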
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 6eb208d..ed381e7 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -122,7 +122,7 @@ def getDirector(html):
 
 def getOutline(html, number, title):
     storyline_site = config.getInstance().storyline_site().split(',')
-    a = set(storyline_site) & {'airav', 'avno1'}
+    a = set(storyline_site) & {'airav', 'avno1'}  # only the Chinese outline text is wanted here
     if len(a):
         site = [n for n in storyline_site if n in a]
         g = getStoryline(number, title, site)
diff --git a/config.ini b/config.ini
index b4d9fb4..eef14db 100755
--- a/config.ini
+++ b/config.ini
@@ -86,11 +86,16 @@ extrafanart_folder=extrafanart
 
 ; Storyline
 [storyline]
-; When website is javbus or javdb, site is the list of optional data sources for storyline text. Sites in the list are queried
-; concurrently; priority runs from left to right, and a site further right is only used when the sites to its left return nothing.
-; airav and avno1 give Chinese storylines, xcity and amazon Japanese ones; since the amazon store has no ID numbers, the matching DVD is picked with only 99.6% accuracy. An empty list disables the lookup, which speeds up scraping considerably.
+; When website is javbus, javdb, avsox, xcity or carib, the keys site, censored_site and uncensored_site are the lists of
+; optional data sources for storyline text. Sites in a list are queried concurrently; priority runs from left to right, and a
+; site further right is only used when the sites to its left return nothing. airav, avno1 and 58avgo return Chinese storylines:
+; airav only covers censored titles, avno1 covers both, and 58avgo only covers uncensored or leaked/decensored titles (that capability is unused).
+; xcity and amazon are Japanese; since the amazon store has no ID numbers, the matching DVD is picked with only 99.6% accuracy.
+; If all three lists are empty no lookup is done, which speeds up scraping considerably.
 ; site=
-site=airav,avno1,xcity,amazon
+site=avno1
+censored_site=airav,xcity,amazon
+uncensored_site=58avgo
 ; Run mode: 0 = sequential (slowest), 1 = thread pool (default), 2 = process pool (higher start-up cost than the thread pool; faster the more sites are queried concurrently)
 run_mode=1
 ; show_result: storyline debug output, 0 = off, 1 = brief, 2 = verbose (the verbose part is not logged); turn on 2 to see why a storyline lookup fails
diff --git a/config.py b/config.py
index 83a36bc..f6d6488 100644
--- a/config.py
+++ b/config.py
@@ -243,7 +243,19 @@ class Config:
         try:
             return self.conf.get("storyline", "site")
         except:
-            return "airav,avno1,xcity,amazon"
+            return "avno1"
+
+    def storyline_censored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "censored_site")
+        except:
+            return "airav,xcity,amazon"
+
+    def storyline_uncensored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "uncensored_site")
+        except:
+            return "58avgo"
 
     def storyline_show(self) -> int:
         try:
@@ -354,7 +366,9 @@ class Config:
 
         sec14 = "storyline"
         conf.add_section(sec14)
-        conf.set(sec14, "site", "airav,avno1,xcity,amazon")
+        conf.set(sec14, "site", "avno1")
+        conf.set(sec14, "censored_site", "airav,xcity,amazon")
+        conf.set(sec14, "uncensored_site", "58avgo")
         conf.set(sec14, "show_result", 0)
         conf.set(sec14, "run_mode", 1)
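Each new getter follows the same pattern as the existing ones: read the key, and fall back to the shipped default when the key (or the whole section) is missing, so older config.ini files keep working unchanged. A minimal sketch of that fallback with Python's standard configparser (value and default taken from the hunk above):

    import configparser

    conf = configparser.ConfigParser()
    conf.read_string("[storyline]\nsite=avno1\n")   # censored_site / uncensored_site omitted

    def storyline_censored_site() -> str:
        try:
            return conf.get("storyline", "censored_site")   # raises NoOptionError here
        except:
            return "airav,xcity,amazon"                     # default mirrors config.py

    print(storyline_censored_site())                        # 'airav,xcity,amazon'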
"run_mode", 1) diff --git a/core.py b/core.py index d7066f4..24c1ce5 100755 --- a/core.py +++ b/core.py @@ -566,10 +566,7 @@ def core_main(file_path, number_th): c_word = '-C' # 中文字幕影片后缀 # 判断是否无码 - if is_uncensored(number): - uncensored = 1 - else: - uncensored = 0 + uncensored = 1 if is_uncensored(number) else 0 if '流出' in filepath or 'uncensored' in filepath: diff --git a/number_parser.py b/number_parser.py index 212c2c0..4d4fe93 100755 --- a/number_parser.py +++ b/number_parser.py @@ -71,7 +71,7 @@ G_TAKE_NUM_RULES = { '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), - 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) + 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]) } def get_number_by_dict(filename: str) -> str: