From f21fdcb5f5a5ec242b31f1130d71d54072c2985b Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 09:53:09 +0800 Subject: [PATCH 01/56] log dir adapts to makedirs(), fix CmdLine output --- AV_Data_Capture.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index c1c7ee4..5d3263f 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -59,6 +59,7 @@ class OutLogger(object): def __init__(self, logfile) -> None: self.term = sys.stdout self.log = open(logfile,"w",encoding='utf-8',buffering=1) + self.filepath = logfile def __del__(self): self.close() def __enter__(self): @@ -85,6 +86,7 @@ class ErrLogger(OutLogger): def __init__(self, logfile) -> None: self.term = sys.stderr self.log = open(logfile,"w",encoding='utf-8',buffering=1) + self.filepath = logfile def close(self): if self.term != None: sys.stderr = self.term @@ -97,10 +99,15 @@ class ErrLogger(OutLogger): def dupe_stdout_to_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0: return - if not os.path.isdir(logdir): - os.makedirs(logdir) + if not os.path.exists(logdir): + try: + os.makedirs(logdir) + except: + pass if not os.path.isdir(logdir): return + elif not os.path.isdir(logdir): + return log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt') @@ -113,8 +120,16 @@ def dupe_stdout_to_logfile(logdir: str): def close_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): return + #日志关闭前保存日志文件路径 + filepath = '' + try: + filepath = sys.stdout.filepath + except: + pass sys.stdout.close() sys.stderr.close() + if len(filepath): + print("Log file '{}' saved.".format(filepath)) # 清理空文件 for current_dir, subdirs, files in os.walk(logdir, topdown=False): try: @@ -304,7 +319,8 @@ if __name__ == '__main__': print('[+]Enable debug') if conf.soft_link(): print('[!]Enable soft link') - #print('[!]CmdLine:'," ".join(sys.argv[1:])) + if len(sys.argv)>1: + print('[!]CmdLine:'," ".join(sys.argv[1:])) create_failed_folder(conf.failed_folder()) start_time = time.time() @@ -353,9 +369,10 @@ if __name__ == '__main__': " End at", time.strftime("%Y-%m-%d %H:%M:%S")) print("[+]All finished!!!") - if not (conf.auto_exit() or auto_exit): - input("Press enter key exit, you can check the error message before you exit...") close_logfile(logdir) + if not (conf.auto_exit() or auto_exit): + input("Press enter key exit, you can check the error message before you exit...") + sys.exit(0) From f52db0011c222ae6c543b11588eab34fb1a34e4a Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:21:47 +0800 Subject: [PATCH 02/56] optimize if logic --- AV_Data_Capture.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 5d3263f..1f2df85 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -104,9 +104,7 @@ def dupe_stdout_to_logfile(logdir: str): os.makedirs(logdir) except: pass - if not os.path.isdir(logdir): - return - elif not os.path.isdir(logdir): + if not os.path.isdir(logdir): return log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") From 8ef87c285fbbfd615dac9a5efa5c5ec927bc8ff6 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:43:54 +0800 Subject: [PATCH 03/56] =?UTF-8?q?=E5=86=8D=E5=B0=86=E5=85=B6=E5=AE=83?= =?UTF-8?q?=E5=87=A0=E4=B8=AAmakedirs()=E4=B8=80=E8=B5=B7=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3=EF=BC=8C=E5=8E=BB=E6=8E=89=E9=94=99=E4=B8=8A=E5=8A=A0?= 
=?UTF-8?q?=E9=94=99=E7=9A=84=E6=8F=90=E5=8D=87=E5=88=B0admin=E5=BB=BA?= =?UTF-8?q?=E8=AE=AE=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 6 ++---- core.py | 40 ++++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 1f2df85..766d826 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -214,13 +214,11 @@ def movie_lists(root, conf, regexstr): def create_failed_folder(failed_folder): - if not os.path.isdir(failed_folder): # 新建failed文件夹 + if not os.path.exists(failed_folder): # 新建failed文件夹 try: os.makedirs(failed_folder) - if not os.path.isdir(failed_folder): - raise except: - print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)") + print(f"[-]Fatal error! Can not make folder '{failed_folder}'") sys.exit(0) diff --git a/core.py b/core.py index cb1a782..90da00c 100755 --- a/core.py +++ b/core.py @@ -83,17 +83,19 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 location_rule = location_rule.replace(title, shorttitle) path = os.path.join(success_folder, location_rule).strip() - if not os.path.isdir(path): + if not os.path.exists(path): path = escape_path(path, conf.escape_literals()) try: os.makedirs(path) - if not os.path.isdir(path): - raise except: path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number") path = escape_path(path, conf.escape_literals()) + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) - os.makedirs(path) return path @@ -106,10 +108,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa for i in range(configProxy.retry): try: if configProxy.enable: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) proxies = configProxy.proxies() headers = { 'User-Agent': G_USER_AGENT} @@ -121,10 +125,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa code.write(r.content) return else: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) headers = { 'User-Agent': G_USER_AGENT} r = requests.get(url, timeout=configProxy.timeout, headers=headers) @@ -224,10 +230,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f else: nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo") try: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! 
can not make folder '{path}'") + sys.exit(0) with open(nfo_path, "wt", encoding='UTF-8') as code: print('', file=code) print("", file=code) From 6d1e99d8ab98ac991d601c2708fe14b4267772e1 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:44:57 +0800 Subject: [PATCH 04/56] fix issue 603 --- core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core.py b/core.py index 90da00c..7794b9e 100755 --- a/core.py +++ b/core.py @@ -96,7 +96,7 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 print(f"[-]Fatal error! Can not make folder '{path}'") sys.exit(0) - return path + return os.path.normcase(path) # =====================资源下载部分=========================== From 952e2c9a30377b677ec9167041d3d51c727065f0 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 3 Oct 2021 10:59:25 +0800 Subject: [PATCH 05/56] =?UTF-8?q?=E6=89=80=E6=9C=89makedirs()=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E5=81=9A=E7=9B=B8=E5=90=8C=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 766d826..257d9b0 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -103,7 +103,8 @@ def dupe_stdout_to_logfile(logdir: str): try: os.makedirs(logdir) except: - pass + print(f"[-]Fatal error! Can not make log folder '{logdir}'") + sys.exit(0) if not os.path.isdir(logdir): return From 5df03392793ff5873427f044261c2a162c1f3219 Mon Sep 17 00:00:00 2001 From: lededev Date: Mon, 4 Oct 2021 23:57:16 +0800 Subject: [PATCH 06/56] =?UTF-8?q?=E7=94=A8normpath()=E6=89=8D=E8=83=BD?= =?UTF-8?q?=E7=BB=B4=E6=8C=81=E5=8E=9F=E6=9D=A5=E7=9A=84=E5=A4=A7=E5=B0=8F?= =?UTF-8?q?=E5=86=99=EF=BC=8Cnormcase()=E4=BC=9A=E5=85=A8=E9=83=A8?= =?UTF-8?q?=E5=8F=98=E4=B8=BA=E5=B0=8F=E5=86=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core.py b/core.py index 7794b9e..3ca9eb2 100755 --- a/core.py +++ b/core.py @@ -96,7 +96,7 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 print(f"[-]Fatal error! 
Can not make folder '{path}'") sys.exit(0) - return os.path.normcase(path) + return os.path.normpath(path) # =====================资源下载部分=========================== From 3183d284b78c8d281129813d5f0da3f7c9083276 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 08:33:03 +0800 Subject: [PATCH 07/56] number_parser.py:add more studio, unit test, full disk search as unit test --- number_parser.py | 173 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 54 deletions(-) diff --git a/number_parser.py b/number_parser.py index 2d1874e..616af85 100755 --- a/number_parser.py +++ b/number_parser.py @@ -1,14 +1,13 @@ import os import re -from core import * - +import sys G_spat = re.compile( "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@", re.IGNORECASE) -def get_number(debug,filepath: str) -> str: +def get_number(debug,file_path: str) -> str: # """ # >>> from number_parser import get_number # >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4") @@ -32,77 +31,143 @@ def get_number(debug,filepath: str) -> str: # >>> get_number("snis-829-C.mp4") # 'snis-829' # """ - filepath = os.path.basename(filepath) - - if debug == False: - try: - if '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 - #filepath = filepath.replace("_", "-") - filepath = G_spat.sub("", filepath) - filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 - lower_check = filename.lower() - if 'fc2' in lower_check: - filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper() - file_number = get_number_by_dict(lower_check) - if file_number: - return file_number - return str(re.search(r'\w+(-|_)\w+', filename, re.A).group()) - else: # 提取不含减号-的番号,FANZA CID - # 欧美番号匹配规则 - oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) - if oumei: - return oumei.group() - - try: - return str( - re.findall(r'(.+?)\.', - str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( - "['']").replace('_', '-') - except: - return re.search(r'(.+?)\.', filepath)[0] - except Exception as e: - print('[-]' + str(e)) - return - elif debug == True: - if '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 - #filepath = filepath.replace("_", "-") + filepath = os.path.basename(file_path) + # debug True 和 False 两块代码块合并,原因是此模块及函数只涉及字符串计算,没有IO操作,debug on时输出导致异常信息即可 + try: + file_number = get_number_by_dict(filepath) + if file_number: + return file_number + elif '-' in filepath or '_' in filepath: # 普通提取番号 主要处理包含减号-和_的番号 filepath = G_spat.sub("", filepath) filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 lower_check = filename.lower() if 'fc2' in lower_check: filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper() - file_number = get_number_by_dict(lower_check) - if file_number: - return file_number return str(re.search(r'\w+(-|_)\w+', filename, re.A).group()) else: # 提取不含减号-的番号,FANZA CID # 欧美番号匹配规则 oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath) if oumei: return oumei.group() - try: return str( re.findall(r'(.+?)\.', - str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( + str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip( "['']").replace('_', '-') except: - return re.search(r'(.+?)\.', filepath)[0] + return str(re.search(r'(.+?)\.', filepath)[0]) + except Exception as e: + if debug: + print(f'[-]Number Parser exception: {e} [{file_path}]') + return None 
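+
+# Usage sketch (illustrative only; expected values mirror the test cases in
+# __main__ below -- actual results depend on G_TAKE_NUM_RULES defined next):
+#   get_number(True, "Tokyo Hot n9001 FHD.mp4")  ->  'n9001'      (studio rule)
+#   get_number(True, "caribean-020317_001.nfo")  ->  '020317-001' (carib rule, '_' fixed to '-')
+#   get_number(True, "snis-829-C.mp4")           ->  'snis-829'   (generic \w+(-|_)\w+ branch)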
+ +# 按javdb数据源的命名规范提取number G_TAKE_NUM_RULES = { - 'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()), - 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'), - '1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'), - '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'), - 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group()) - } + 'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()), + 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'), + '1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'), + '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), + 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), + 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), + 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) +} -def get_number_by_dict(lower_filename: str) -> str: - for k,v in G_TAKE_NUM_RULES.items(): - if k in lower_filename: - return v(lower_filename) +def get_number_by_dict(filename: str) -> str: + try: + for k,v in G_TAKE_NUM_RULES.items(): + if re.search(k, filename, re.I): + return v(filename) + except: + pass return None -# if __name__ == "__main__": +if __name__ == "__main__": # import doctest # doctest.testmod(raise_on_error=True) + test_use_cases = ( + "Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取 + "TokyoHot-n1287-HD SP2006 .mp4", + "caribean-020317_001.nfo", # -号误命名为_号的 + "257138_3xplanet_1Pondo_080521_001.mp4", + "ADV-R0624-CD3.wmv", # 多碟影片 + "XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源 + "xxx-av 20589.mp4", + "Muramura-102114_145-HD.wmv", # 新支持片商格式 102114_145 命名规则来自javdb数据源 + "heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源 + "HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源 + "pacopacomama-093021_539-FHD.mkv" # 新支持片商格式 093021_539 命名规则来自javdb数据源 + ) + def evprint(evstr): + code = compile(evstr, "", "eval") + print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code))) + for t in test_use_cases: + evprint(f'get_number(True, "{t}")') + + if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE): + sys.exit(0) + + # 使用Everything的ES命令行工具搜集全盘视频文件名作为用例测试number数据,参数为盘符 A .. 
Z 或带盘符路径 + # https://www.voidtools.com/support/everything/command_line_interface/ + # ES命令行工具需要Everything文件搜索引擎处于运行状态,es.exe单个执行文件需放入PATH路径中。 + # Everything是免费软件 + # 示例: + # python.exe .\number_parser.py ALL # 从所有磁盘搜索视频 + # python.exe .\number_parser.py D # 从D盘搜索 + # python.exe .\number_parser.py D: # 同上 + # python.exe .\number_parser.py D:\download\JAVs # 搜索D盘的\download\JAVs目录,路径必须带盘符 + # ================== + # Linux/WSL1|2 使用mlocate(Ubuntu/Debian)或plocate(Debian sid)搜集全盘视频文件名作为测试用例number数据 + # 需安装'sudo apt install mlocate或plocate'并首次运行sudo updatedb建立全盘索引 + # MAC OS X 使用findutils的glocate,需安装'sudo brew install findutils'并首次运行sudo gupdatedb建立全盘索引 + # 示例: + # python3 ./number_parser.py ALL + import subprocess + ES_search_path = "ALL disks" + if sys.argv[1] == "ALL": + if sys.platform == "win32": + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + elif sys.platform in ("linux", "darwin"): + ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate' + ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path) + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('utf-8') + out_list = [ os.path.basename(line) for line in out_text.splitlines()] + else: + print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.') + sys.exit(1) + else: # Windows single disk + if sys.platform != "win32": + print('[!]Usage: python3 ./number_parser.py ALL') + sys.exit(0) + # ES_prog_path = 'C:/greensoft/es/es.exe' + ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内 + if os.path.isdir(sys.argv[1]): + ES_search_path = sys.argv[1] + else: + ES_search_path = sys.argv[1][0] + ':/' + if not os.path.isdir(ES_search_path): + ES_search_path = 'C:/' + ES_search_path = os.path.normcase(ES_search_path) + ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v' + out_bytes = subprocess.check_output(ES_cmdline.split(' ')) + out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失 + out_list = out_text.splitlines() + print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...') + print(f'[+]Find {len(out_list)} Movies.') + for filename in out_list: + try: + n = get_number(True, filename) + if n: + print(f' [{n}] # {filename}') + else: + print(f'[-]Number return None. 
# {filename}') + except Exception as e: + print(f'[-]Number Parser exception: {e} [{filename}]') + + sys.exit(0) From 39ad0257603f734a812cea0a0ad2fe487b31b29b Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 10:22:05 +0800 Subject: [PATCH 08/56] config.py:override config settings by cmd params, pyinstaller add config.ini --- Makefile | 4 +- config.ini | 2 + config.py | 149 +++++++++++++++++++++++++++++++++++++-------- py_to_exe.ps1 | 5 +- wrapper/FreeBSD.sh | 6 +- wrapper/Linux.sh | 6 +- 6 files changed, 143 insertions(+), 29 deletions(-) diff --git a/Makefile b/Makefile index 407aa4b..4c8960a 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,9 @@ make: #export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1) @echo "[+]Pyinstaller make" - pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img" + pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ @echo "[+]Move to bin" if [ ! -d "./bin" ];then mkdir bin; fi diff --git a/config.ini b/config.ini index 58e6892..f33a578 100755 --- a/config.ini +++ b/config.ini @@ -1,5 +1,6 @@ [common] main_mode=1 +source_folder=./ failed_output_folder=failed success_output_folder=JAV_output soft_link=0 @@ -16,6 +17,7 @@ nfo_skip_days=30 ; 处理完多少个视频文件后停止,0为处理所有视频文件 stop_counter=0 ; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 +ignore_failed_list=0 [proxy] ;proxytype: http or socks5 or socks5h switch: 0 1 diff --git a/config.py b/config.py index 82fd345..2b49ca0 100644 --- a/config.py +++ b/config.py @@ -1,33 +1,83 @@ import os +import re import sys import configparser -import codecs from pathlib import Path + +G_conf_override = { + # index 0 save Config() first instance for quick access by using getInstance() + 0 : None, + # register override config items + "common:main_mode" : None, + "common:source_folder" : None, + "common:auto_exit" : None, + "common:nfo_skip_days" : None, + "common:stop_counter" : None, + "common:ignore_failed_list" : None, + "debug_mode:switch" : None +} + + +def getInstance(): + if isinstance(G_conf_override[0], Config): + return G_conf_override[0] + return Config() + + class Config: def __init__(self, path: str = "config.ini"): - path_search_order = [ - path, - "./config.ini", - os.path.join(Path.home(), "avdc.ini"), - os.path.join(Path.home(), ".avdc.ini"), - os.path.join(Path.home(), ".avdc/config.ini"), - os.path.join(Path.home(), ".config/avdc/config.ini") - ] + path_search_order = ( + Path(path), + Path.cwd() / "config.ini", + Path.home() / "avdc.ini", + Path.home() / ".avdc.ini", + Path.home() / ".avdc/config.ini", + Path.home() / ".config/avdc/config.ini" + ) ini_path = None for p in path_search_order: - if os.path.isfile(p): - ini_path = p + if p.is_file(): + ini_path = p.resolve() break if ini_path: self.conf = configparser.ConfigParser() + self.ini_path = ini_path try: - self.conf.read(ini_path, encoding="utf-8-sig") + if self.conf.read(ini_path, encoding="utf-8-sig"): + if G_conf_override[0] is None: + G_conf_override[0] = self except: - self.conf.read(ini_path, encoding="utf-8") + if self.conf.read(ini_path, encoding="utf-8"): + if G_conf_override[0] is None: + G_conf_override[0] = self else: - print("[-]Config file not found!") - sys.exit(2) + print("ERROR: Config file not found!") + print("Please put config file into one of the following path:") + print('\n'.join([str(p.resolve()) for p in 
path_search_order[2:]])) + # 对于找不到配置文件的情况,还是在打包时附上对应版本的默认配置文件,有需要时为其在搜索路径中生成, + # 要比用户乱找一个版本不对应的配置文件会可靠些。这样一来,单个执行文件就是功能完整的了,放在任何 + # 执行路径下都可以放心使用。 + res_path = None + # pyinstaller打包的在打包中找config.ini + if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file(): + res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini' + # 脚本运行的所在位置找 + elif (Path(__file__).resolve().parent / 'config.ini').is_file(): + res_path = Path(__file__).resolve().parent / 'config.ini' + if res_path is None: + sys.exit(2) + ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:") + if re.search('n', ins, re.I): + sys.exit(2) + # 用户目录才确定具有写权限,因此选择 ~/avdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 + # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 + write_path = path_search_order[2] # Path.home() / "avdc.ini" + with open(write_path, 'w', encoding='utf-8') as wcfg: + wcfg.write(res_path.read_text(encoding='utf-8')) + print("Config file '{}' created.".format(write_path.resolve())) + input("Press Enter key exit...") + sys.exit(0) # self.conf = self._default_config() # try: # self.conf = configparser.ConfigParser() @@ -40,13 +90,24 @@ class Config: # print("[-]",e) # sys.exit(3) # #self.conf = self._default_config() + def getboolean_override(self, section, item) -> bool: + return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"]) - def main_mode(self) -> str: + def getint_override(self, section, item) -> int: + return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"]) + + def get_override(self, section, item) -> str: + return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"]) + + def main_mode(self) -> int: try: - return self.conf.getint("common", "main_mode") + return self.getint_override("common", "main_mode") except ValueError: self._exit("common:main_mode") + def source_folder(self) -> str: + return self.get_override("common", "source_folder") + def failed_folder(self) -> str: return self.conf.get("common", "failed_output_folder") @@ -61,7 +122,7 @@ class Config: def failed_move(self) -> bool: return self.conf.getboolean("common", "failed_move") def auto_exit(self) -> bool: - return self.conf.getboolean("common", "auto_exit") + return self.getboolean_override("common", "auto_exit") def transalte_to_sc(self) -> bool: return self.conf.getboolean("common", "transalte_to_sc") def multi_threading(self) -> bool: @@ -70,14 +131,16 @@ class Config: return self.conf.getboolean("common", "del_empty_folder") def nfo_skip_days(self) -> int: try: - return self.conf.getint("common", "nfo_skip_days") + return self.getint_override("common", "nfo_skip_days") except: return 30 def stop_counter(self) -> int: try: - return self.conf.getint("common", "stop_counter") + return self.getint_override("common", "stop_counter") except: return 0 + def ignore_failed_list(self) -> bool: + return self.getboolean_override("common", "ignore_failed_list") def is_transalte(self) -> bool: return self.conf.getboolean("transalte", "switch") def is_trailer(self) -> bool: @@ -173,7 +236,7 @@ class Config: return self.conf.get("escape", "folders") def debug(self) -> bool: - return self.conf.getboolean("debug_mode", "switch") + return self.getboolean_override("debug_mode", "switch") @staticmethod def _exit(sec: str) -> None: @@ -188,6 +251,7 @@ class Config: sec1 = "common" conf.add_section(sec1) 
conf.set(sec1, "main_mode", "1") + conf.set(sec1, "source_folder", "./") conf.set(sec1, "failed_output_folder", "failed") conf.set(sec1, "success_output_folder", "JAV_output") conf.set(sec1, "soft_link", "0") @@ -199,6 +263,7 @@ class Config: conf.set(sec1, "del_empty_folder", "1") conf.set(sec1, "nfo_skip_days", 30) conf.set(sec1, "stop_counter", 0) + conf.set(sec1, "ignore_failed_list", 0) sec2 = "proxy" conf.add_section(sec2) @@ -308,9 +373,45 @@ if __name__ == "__main__": code = compile(evstr, "", "eval") print('{}: "{}"'.format(evstr, eval(code))) config = Config() - mfilter = ('conf', 'proxy', '_exit', '_default_config') + mfilter = ('conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path') for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: evprint(f'config.{_m}()') pfilter = ('proxies', 'SUPPORT_PROXY_TYPE') - for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]: - evprint(f'config.proxy().{_p}') + # test getInstance() + assert(getInstance() == config) + for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: + evprint(f'getInstance().proxy().{_p}') + + # Override Test + G_conf_override["common:nfo_skip_days"] = 4321 + G_conf_override["common:stop_counter"] = 1234 + assert config.nfo_skip_days() == 4321 + assert getInstance().stop_counter() == 1234 + # remove override + G_conf_override["common:stop_counter"] = None + G_conf_override["common:nfo_skip_days"] = None + assert config.nfo_skip_days() != 4321 + assert config.stop_counter() != 1234 + # Create new instance + conf2 = Config() + assert getInstance() != conf2 + assert getInstance() == config + G_conf_override["common:main_mode"] = 9 + G_conf_override["common:source_folder"] = "A:/b/c" + # Override effect to all instances + assert config.main_mode() == 9 + assert conf2.main_mode() == 9 + assert getInstance().main_mode() == 9 + assert conf2.source_folder() == "A:/b/c" + print("### Override Test ###".center(36)) + evprint('getInstance().main_mode()') + evprint('config.source_folder()') + G_conf_override["common:main_mode"] = None + evprint('conf2.main_mode()') + evprint('config.main_mode()') + # unregister key acess will raise except + try: + print(G_conf_override["common:actor_gender"]) + except KeyError as ke: + print(f'Catched KeyError: {ke} is not a register key of G_conf_override dict.', file=sys.stderr) + print(f"Load Config file '{conf2.ini_path}'.") diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 index 7fc0f80..77f169a 100644 --- a/py_to_exe.ps1 +++ b/py_to_exe.ps1 @@ -3,14 +3,15 @@ $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1) -mkdir build +mkdir build mkdir __pycache__ pyinstaller --onefile AV_Data_Capture.py ` --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` - --add-data "Img;Img" + --add-data "Img;Img" ` + --add-data "config.ini;." 
` rmdir -Recurse -Force build rmdir -Recurse -Force __pycache__ diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh index 70f27d7..9717ef4 100755 --- a/wrapper/FreeBSD.sh +++ b/wrapper/FreeBSD.sh @@ -1,4 +1,8 @@ pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448 pip install pyquery pyinstaller -pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img" +pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + cp config.ini ./dist diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh index 1d05e6a..63e3b1c 100755 --- a/wrapper/Linux.sh +++ b/wrapper/Linux.sh @@ -12,5 +12,9 @@ #fi pip3 install -r requirements.txt pip3 install cloudscraper==1.2.52 -pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img" +pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ + cp config.ini ./dist From 8cb57673b05afb8522ecd090d25556f96d057246 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:15:30 +0800 Subject: [PATCH 09/56] log auto merge --- AV_Data_Capture.py | 96 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 18 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 257d9b0..19a3212 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -99,18 +99,18 @@ class ErrLogger(OutLogger): def dupe_stdout_to_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0: return - if not os.path.exists(logdir): + log_dir = Path(logdir) + if not log_dir.exists(): try: - os.makedirs(logdir) + log_dir.mkdir(parents=True,exist_ok=True) except: - print(f"[-]Fatal error! 
Can not make log folder '{logdir}'") - sys.exit(0) - if not os.path.isdir(logdir): - return - + pass + if not log_dir.is_dir(): + return # Tips for disabling logs by change directory to a same name empty regular file + abslog_dir = log_dir.resolve() log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") - logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt') - errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt') + logfile = abslog_dir / f'avdc_{log_tmstr}.txt' + errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt' sys.stdout = OutLogger(logfile) sys.stderr = ErrLogger(errlog) @@ -119,25 +119,85 @@ def dupe_stdout_to_logfile(logdir: str): def close_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): return - #日志关闭前保存日志文件路径 - filepath = '' + #日志关闭前保存日志路径 + filepath = None try: filepath = sys.stdout.filepath except: pass sys.stdout.close() sys.stderr.close() - if len(filepath): - print("Log file '{}' saved.".format(filepath)) + log_dir = Path(logdir).resolve() + if isinstance(filepath, Path): + print(f"Log file '{filepath}' saved.") + assert(filepath.parent.samefile(log_dir)) # 清理空文件 - for current_dir, subdirs, files in os.walk(logdir, topdown=False): + for f in log_dir.glob(r'*_err.txt'): + if f.stat().st_size == 0: + try: + f.unlink(missing_ok=True) + except: + pass + # 合并日志 只检测日志目录内的文本日志,忽略子目录。三个月前的日志,按月合并为一个月志, + # 去年及以前的月志,今年4月以后将之按年合并为年志 + # 测试步骤: + """ + LOGDIR=/tmp/avlog + mkdir -p $LOGDIR + for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done + for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done + echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" + # 1932 files in /tmp/avlog + avdc -zgic1 -d0 -m3 -o $LOGDIR + # python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR + ls $LOGDIR + # rm -rf $LOGDIR + """ + # 第一步,合并到月 + for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] + if not txts or not len(txts): + break + txts.sort() + today = datetime.today() + tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") + deadline_month = f'avdc_{tmstr_3_month_ago}' + month_merge = [f for f in txts if f.stem < deadline_month] + if not month_merge or not len(month_merge): + break + tomonth = len('01T235959.txt') # cut length avdc_202012|01T235959.txt + for f in month_merge: + try: + month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt + with open(month_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第二步,月合并到年 + if today.month < 4: + return + mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{6}', f.stem, re.A)] + if not mons or not len(mons): + return + mons.sort() + deadline_year = f'avdc_{today.year-1}13' + year_merge = [f for f in mons if f.stem < deadline_year] + if not year_merge or not len(year_merge): + return + toyear = len('12.txt') # cut length avdc_2020|12.txt + for f in year_merge: try: - for f in files: - full_name = os.path.join(current_dir, f) - if os.path.getsize(full_name) == 0: - os.remove(full_name) + year_file_name = str(f)[:-toyear] + '.txt' # avdc_2020.txt + with open(year_file_name, 'a', encoding='utf-8') as y: + y.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) except: pass + # 第三步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 + # 这种粒度的文本日志,压缩比是目前最好的。lzip -9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, + # 
多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, + # 100MB的日志文件能缩小到3.7MB。 # 重写视频文件扫描,消除递归,取消全局变量,新增失败文件列表跳过处理 From cf072e79d1c352e1c271bf98923a4d7905f60ca0 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:29:47 +0800 Subject: [PATCH 10/56] =?UTF-8?q?=E8=BE=93=E5=87=BA=E6=8E=92=E7=89=88?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=8Cnumber=E6=94=BE=E5=9C=A8=E5=B7=A6?= =?UTF-8?q?=E8=BE=B9=E5=9B=BA=E5=AE=9A=E4=BD=8D=E7=BD=AE=EF=BC=8C=E4=B8=8A?= =?UTF-8?q?=E4=B8=80=E8=A1=8C=E7=9A=84=E7=95=99=E7=99=BD=E4=BB=A5=E4=BE=BF?= =?UTF-8?q?=E8=BF=85=E9=80=9F=E5=AE=9A=E4=BD=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 19a3212..6ab00ad 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -306,7 +306,7 @@ def create_data_and_move(file_path: str, c: config.Config, debug): file_path = os.path.abspath(file_path) if debug == True: - print(f"[!]Making Data for [{file_path}], the number is [{n_number}]") + print(f"[!] [{n_number}] As Number making data for '{file_path}'") if n_number: core_main(file_path, n_number, c) else: @@ -314,7 +314,7 @@ def create_data_and_move(file_path: str, c: config.Config, debug): print("[*]======================================================") else: try: - print(f"[!]Making Data for [{file_path}], the number is [{n_number}]") + print(f"[!] [{n_number}] As Number making data for '{file_path}'") if n_number: core_main(file_path, n_number, c) else: @@ -333,8 +333,11 @@ def create_data_and_move(file_path: str, c: config.Config, debug): def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number): file_name = os.path.basename(file_path) try: - print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number)) - core_main(file_path, custom_number, c) + print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number)) + if custom_number: + core_main(file_path, custom_number, c) + else: + print("[-] number empty ERROR") print("[*]======================================================") except Exception as err: print("[-] [{}] ERROR:".format(file_path)) @@ -372,6 +375,7 @@ if __name__ == '__main__': if conf.update_check(): check_update(version) + print(f"[+]Load Config file '{conf.ini_path}'.") if conf.debug(): print('[+]Enable debug') if conf.soft_link(): @@ -408,7 +412,7 @@ if __name__ == '__main__': for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' - print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -') + print('[!] 
{:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) create_data_and_move(movie_path, conf, conf.debug()) if count >= stop_count: print("[!]Stop counter triggered!") From a405c5c41bf21f20ea8c8f13b5fac3a40d718c41 Mon Sep 17 00:00:00 2001 From: lededev Date: Fri, 8 Oct 2021 11:46:35 +0800 Subject: [PATCH 11/56] =?UTF-8?q?WebCrawler:=E5=85=A8=E9=9D=A2=E6=8D=A2?= =?UTF-8?q?=E8=A3=85getInstance()=EF=BC=8C=E5=8E=98=E6=B8=85airav.py?= =?UTF-8?q?=E4=B8=8Ejavbus.py=E5=8F=8Ajavdb.py=E7=9A=84=E7=9B=B8=E7=88=B1?= =?UTF-8?q?=E7=9B=B8=E6=9D=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 10 +++ WebCrawler/airav.py | 194 ++++++++++++++++++++--------------------- WebCrawler/avsox.py | 8 +- WebCrawler/carib.py | 61 ++++++------- WebCrawler/dlsite.py | 2 +- WebCrawler/fc2.py | 6 +- WebCrawler/fc2club.py | 4 +- WebCrawler/javbus.py | 23 +++-- WebCrawler/javdb.py | 61 ++++++++----- WebCrawler/mgstage.py | 2 +- WebCrawler/xcity.py | 2 +- 11 files changed, 206 insertions(+), 167 deletions(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index e1608b6..dc54b46 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -134,6 +134,14 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 print('[-]Movie Number not found!') return None + # 增加number严格判断,避免提交任何number,总是返回"本橋実来 ADZ335",这种返回number不一致的数据源故障 + # 目前选用number命名规则是javdb.com Domain Creation Date: 2013-06-19T18:34:27Z + # 然而也可以跟进关注其它命名规则例如airav.wiki Domain Creation Date: 2019-08-28T07:18:42.0Z + # 如果将来javdb.com命名规则下不同Studio出现同名碰撞导致无法区分,可考虑更换规则,更新相应的number分析和抓取代码。 + if str(json_data.get('number')).upper() != file_number.upper(): + print('[-]Movie number has changed! 
[{}]->[{}]'.format(file_number, str(json_data.get('number')))) + return None + # ================================================网站规则添加结束================================================ title = json_data.get('title') @@ -225,6 +233,8 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 studio = studio.replace('エムズビデオグループ','M’s Video Group') studio = studio.replace('ミニマム','Minimum') studio = studio.replace('ワープエンタテインメント','WAAP Entertainment') + studio = studio.replace('pacopacomama,パコパコママ','pacopacomama') + studio = studio.replace('パコパコママ','pacopacomama') studio = re.sub('.*/妄想族','妄想族',studio) studio = studio.replace('/',' ') # === 替换Studio片假名 END diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py index 5925421..f7b144c 100644 --- a/WebCrawler/airav.py +++ b/WebCrawler/airav.py @@ -6,6 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * +from WebCrawler import javbus ''' API @@ -17,95 +18,94 @@ API host = 'https://www.airav.wiki' # airav这个网站没有演员图片,所以直接使用javbus的图 -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) - d={} - for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) - p=urljoin("https://www.javbus.com", - str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) - p2={t:p} - d.update(p2) - return d +def getActorPhoto(javbus_json): + result = javbus_json.get('actor_photo') + if isinstance(result, dict) and len(result): + return result + return '' def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - # h5:first-child定位第一个h5标签,妈的找了好久才找到这个语法 - title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-') - try: - title2 = re.sub('n\d+-','',title) + html = etree.fromstring(htmlcode, etree.HTMLParser()) + title = str(html.xpath('/html/head/title/text()')[0]) + result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip() + return result - return title2 +def getStudio(htmlcode, javbus_json): #获取厂商 已修改 + # javbus如果有数据以它为准 + result = javbus_json.get('studio') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode,etree.HTMLParser()) + return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']") +def getYear(htmlcode, javbus_json): #获取年份 + result = javbus_json.get('year') + if isinstance(result, str) and len(result): + return result + release = getRelease(htmlcode, javbus_json) + if len(release) != len('2000-01-01'): + return '' + return release[:4] +def getCover(htmlcode, javbus_json): #获取封面图片 + result = javbus_json.get('cover') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode, etree.HTMLParser()) + return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0] +def getRelease(htmlcode, javbus_json): #获取出版日期 + result = javbus_json.get('release') + if isinstance(result, str) and len(result): + return result + html = etree.fromstring(htmlcode, etree.HTMLParser()) + try: + result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group() except: - return title - -def getStudio(htmlcode): #获取厂商 已修改 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - # 如果记录中冇导演,厂商排在第4位 - if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = 
str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - # 如果记录中有导演,厂商排在第5位 - elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - else: - result = '' + return '' return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) - image = doc('a.bigImage') - return urljoin("https://www.javbus.com", image.attr('href')) -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getRuntime(htmlcode): #获取分钟 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") - return result -def getActor(htmlcode): #获取女优 +def getRuntime(javbus_json): #获取播放时长 + result = javbus_json.get('runtime') + if isinstance(result, str) and len(result): + return result + return '' +# airav女优数据库较多日文汉字姓名,javbus较多日语假名,因此airav优先 +def getActor(htmlcode, javbus_json): #获取女优 b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) - return b -def getNum(htmlcode): #获取番号 html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 已修改 + a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()') + for v in a: + v = v.strip() + if len(v): + b.append(v) + if len(b): + return b + result = javbus_json.get('actor') + if isinstance(result, list) and len(result): + return result + return [] +def getNum(htmlcode, javbus_json): #获取番号 + result = javbus_json.get('number') + if isinstance(result, str) and len(result): + return result html = etree.fromstring(htmlcode, etree.HTMLParser()) - if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - else: - result = '' # 记录中有可能没有导演数据 + title = str(html.xpath('/html/head/title/text()')[0]) + result = str(re.findall('^\[(.*?)]', title)[0]) return result - -def getOutline(htmlcode): #获取演员 +def getDirector(javbus_json): #获取导演 已修改 + result = javbus_json.get('director') + if isinstance(result, str) and len(result): + return result + return '' +def getOutline(htmlcode): #获取概述 html = etree.fromstring(htmlcode, etree.HTMLParser()) try: - result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','') + result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip() return result except: return '' -def getSerise(htmlcode): #获取系列 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - # 如果记录中冇导演,系列排在第6位 - if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") - # 如果记录中有导演,系列排在第7位 - elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): - result = 
str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - else: - result = '' - return result +def getSerise(javbus_json): #获取系列 已修改 + result = javbus_json.get('series') + if isinstance(result, str) and len(result): + return result + return '' def getTag(htmlcode): # 获取标签 tag = [] soup = BeautifulSoup(htmlcode, 'lxml') @@ -169,52 +169,50 @@ def main(number): try: try: htmlcode = get_html('https://cn.airav.wiki/video/' + number) - javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number) - + javbus_json = json.loads(javbus.main(number)) except: print(number) dic = { # 标题可使用airav - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), - # 制作商选择使用javbus - 'studio': getStudio(javbus_htmlcode), - # 年份也是用javbus - 'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()), + 'title': getTitle(htmlcode), + # 制作商先找javbus,如果没有再找本站 + 'studio': getStudio(htmlcode, javbus_json), + # 年份先试javbus,如果没有再找本站 + 'year': getYear(htmlcode, javbus_json), # 简介 使用 airav 'outline': getOutline(htmlcode), # 使用javbus - 'runtime': getRuntime(javbus_htmlcode), + 'runtime': getRuntime(javbus_json), # 导演 使用javbus - 'director': getDirector(javbus_htmlcode), - # 作者 使用airav - 'actor': getActor(javbus_htmlcode), - # 发售日使用javbus - 'release': getRelease(javbus_htmlcode), + 'director': getDirector(javbus_json), + # 演员 先试airav + 'actor': getActor(htmlcode, javbus_json), + # 发售日先试javbus + 'release': getRelease(htmlcode, javbus_json), # 番号使用javbus - 'number': getNum(javbus_htmlcode), + 'number': getNum(htmlcode, javbus_json), # 封面链接 使用javbus - 'cover': getCover(javbus_htmlcode), + 'cover': getCover(htmlcode, javbus_json), # 剧照获取 'extrafanart': getExtrafanart(htmlcode), 'imagecut': 1, # 使用 airav 'tag': getTag(htmlcode), # 使用javbus - 'label': getSerise(javbus_htmlcode), + 'label': getSerise(javbus_json), # 妈的,airav不提供作者图片 - 'actor_photo': getActorPhoto(javbus_htmlcode), - +# 'actor_photo': getActorPhoto(javbus_json), 'website': 'https://www.airav.wiki/video/' + number, 'source': 'airav.py', # 使用javbus - 'series': getSerise(javbus_htmlcode), + 'series': getSerise(javbus_json) } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') return js except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", @@ -226,6 +224,6 @@ def main(number): if __name__ == '__main__': - #print(main('ADN-188')) - print(main('ADN-188')) - print(main('CJOD-278')) + print(main('ADV-R0624')) # javbus页面返回404, airav有数据 + print(main('ADN-188')) # 一人 + print(main('CJOD-278')) # 多人 javbus演员名称采用日语假名,airav采用日文汉字 diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 254f3e8..293769a 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -100,6 +100,9 @@ def main(number): soup = BeautifulSoup(web, 'lxml') info = str(soup.find(attrs={'class': 'row movie'})) try: + new_number = getNum(info) + if new_number.upper() != number.upper(): + raise ValueError('number not found') dic = { 'actor': getActor(web), 'title': getTitle(web).strip(getNum(web)), @@ -108,7 +111,7 @@ def main(number): 'runtime': getRuntime(info), 'director': '', # 'release': getRelease(info), - 'number': getNum(info), + 'number': new_number, 'cover': getCover(web), 'cover_small': getCover_small(a), 'imagecut': 3, @@ -121,7 +124,7 @@ def main(number): 'series': getSeries(info), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, 
sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -129,3 +132,4 @@ def main(number): if __name__ == "__main__": print(main('012717_472')) + print(main('1')) # got fake result raise 'number not found' diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index 8eee1af..c1a25d9 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -1,51 +1,53 @@ import sys sys.path.append('../') import json -from bs4 import BeautifulSoup from lxml import html import re from ADC_function import * def main(number: str) -> json: try: - caribbytes, browser = get_html_by_browser( + carib_obj, browser = get_html_by_browser( 'https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type="browser") - if not caribbytes or not caribbytes.ok: + if not carib_obj or not carib_obj.ok: raise ValueError("page not found") lx = html.fromstring(str(browser.page)) if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): raise ValueError("page info not found") + + dic = { + 'title': get_title(lx), + 'studio': '加勒比', + 'year': get_year(lx), + 'outline': get_outline(lx), + 'runtime': get_runtime(lx), + 'director': '', + 'actor': get_actor(lx), + 'release': get_release(lx), + 'number': number, + 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', + 'tag': get_tag(lx), + 'extrafanart': get_extrafanart(lx), + 'label': get_series(lx), + 'imagecut': 1, +# 'actor_photo': get_actor_photo(browser), + 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', + 'source': 'carib.py', + 'series': get_series(lx), + } + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) + return js + except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) - dic = { - 'title': get_title(lx), - 'studio': '加勒比', - 'year': get_year(lx), - 'outline': get_outline(lx), - 'runtime': get_runtime(lx), - 'director': '', - 'actor': get_actor(lx), - 'release': get_release(lx), - 'number': number, - 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', - 'tag': get_tag(lx), - 'extrafanart': get_extrafanart(lx), - 'label': get_series(lx), - 'imagecut': 1, -# 'actor_photo': get_actor_photo(browser), - 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', - 'source': 'carib.py', - 'series': get_series(lx), - } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) - return js + def get_title(lx: html.HtmlElement) -> str: return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip() @@ -114,11 +116,10 @@ def get_actor_photo(browser): if pos<0: continue css = html[pos:pos+100] - p0 = css.find('background: url(') - p1 = css.find('.jpg)') - if p0<0 or p1<0: + cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) + if not cssBGjpgs or not len(cssBGjpgs[0]): continue - p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])} + p = {k: urljoin(browser.url, cssBGjpgs[0])} o.update(p) return o diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py index 066e04f..d22cdb1 100644 --- a/WebCrawler/dlsite.py +++ b/WebCrawler/dlsite.py @@ -153,7 +153,7 @@ def main(number): js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') 
return js except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index e6ae516..0a51fdc 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -93,10 +93,11 @@ def main(number): actor = '素人' lx = etree.fromstring(htmlcode2, etree.HTMLParser()) cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']") + cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover) dic = { 'title': lx.xpath('/html/head/title/text()')[0], 'studio': getStudio_fc2com(htmlcode2), - 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), + 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 'outline': '', # getOutline_fc2com(htmlcode2), 'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]), 'director': getStudio_fc2com(htmlcode2), @@ -116,7 +117,7 @@ def main(number): 'series': '', } except Exception as e: - if ADC_function.config.Config().debug(): + if ADC_function.config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -124,4 +125,5 @@ def main(number): if __name__ == '__main__': print(main('FC2-1787685')) + print(main('FC2-2086710')) diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py index 7d0fac6..df14b3b 100644 --- a/WebCrawler/fc2club.py +++ b/WebCrawler/fc2club.py @@ -84,7 +84,7 @@ def main(number): dic = { 'title': getTitle_fc2com(htmlcode2), 'studio': getStudio_fc2com(htmlcode2), - 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), + 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 'outline': '', # getOutline_fc2com(htmlcode2), 'runtime': '', 'director': getStudio_fc2com(htmlcode2), @@ -103,7 +103,7 @@ def main(number): 'series': '', } except Exception as e: - if ADC_function.config.Config().debug(): + if ADC_function.config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 7446ef3..1af4359 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -6,8 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * -from WebCrawler import fanza -from WebCrawler import airav +import inspect def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img soup = BeautifulSoup(htmlcode, 'lxml') @@ -82,12 +81,16 @@ def getCID(htmlcode): result = re.sub('/.*?.jpg','',string) return result def getOutline(number): #获取剧情介绍 + if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): + return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 try: - response = json.loads(airav.main(number)) - result = response['outline'] + htmlcode = get_html('https://cn.airav.wiki/video/' + number) + from WebCrawler.airav import getOutline as airav_getOutline + result = airav_getOutline(htmlcode) return result except: - return '' + pass + return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -117,13 +120,15 @@ def getExtrafanart(htmlcode): # 获取剧照 extrafanart_pather = re.compile(r'404 Page Not Found" in htmlcode: + raise Exception('404 page not found') dic = { 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), 'studio': getStudio(htmlcode), @@ 
-155,6 +160,8 @@ def main(number): htmlcode = get_html('https://www.fanbus.us/' + number) except: htmlcode = get_html('https://www.javbus.com/' + number) + if "404 Page Not Found" in htmlcode: + raise Exception('404 page not found') dic = { 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), 'studio': getStudio(htmlcode), @@ -180,7 +187,7 @@ def main(number): except: return main_uncensored(number) except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", @@ -191,5 +198,7 @@ def main(number): return js if __name__ == "__main__" : + print(main('ADV-R0624')) # 404 print(main('ipx-292')) print(main('CEMD-011')) + print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index ecc4f36..756be1c 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -5,7 +5,7 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -from WebCrawler import airav +import secrets # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) @@ -21,7 +21,7 @@ def getActor(a): genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') r = [] idx = 0 - actor_gendor = config.Config().actor_gender() + actor_gendor = config.getInstance().actor_gender() if not actor_gendor in ['female','male','both','all']: actor_gendor = 'female' for act in actors: @@ -67,9 +67,15 @@ def getStudio(a): patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>') pianshang = patherr.findall(a) if pianshang: - result = pianshang[0] - else: - result = "" + result = pianshang[0].strip() + if len(result): + return result + # 以卖家作为工作室 + html = etree.fromstring(a, etree.HTMLParser()) + try: + result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") + except: + result = '' return result def getRuntime(a): @@ -171,16 +177,13 @@ def getTrailer(htmlcode): # 获取预告片 return video_url def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - return extrafanart_imgs - return '' - + html = etree.fromstring(htmlcode, etree.HTMLParser()) + result = [] + try: + result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") + except: + pass + return result def getCover(htmlcode): html = etree.fromstring(htmlcode, etree.HTMLParser()) try: @@ -195,11 +198,13 @@ def getDirector(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getOutline(number): #获取剧情介绍 try: - response = json.loads(airav.main(number)) - result = response['outline'] + htmlcode = get_html('https://cn.airav.wiki/video/' + number) + from WebCrawler.airav import getOutline as airav_getOutline + result = airav_getOutline(htmlcode) return result except: - return '' + pass + return '' def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -208,7 +213,7 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = 
random.choice(["javdb9", "javdb30"]) + javdb_site = secrets.choice(["javdb9", "javdb30"]) try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass @@ -303,8 +308,16 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b 'series': getSeries(detail_page), } + if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): + dic['actor'].append('素人') + if not dic['series']: + dic['series'] = dic['studio'] + if not dic['label']: + dic['label'] = dic['studio'] + + except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -316,7 +329,9 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) - print(main('FC2-735670')) - print(main('FC2-1174949')) # not found + print(main('093021_539')) # 没有剧照 片商pacopacomama + # print(main('FC2-2278260')) + # print(main('FC2-735670')) + # print(main('FC2-1174949')) # not found print(main('MVSD-439')) - print(main('EHM0001')) # not found + # print(main('EHM0001')) # not found diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py index 59f4572..8f58cb6 100644 --- a/WebCrawler/mgstage.py +++ b/WebCrawler/mgstage.py @@ -137,7 +137,7 @@ def main(number2): 'series': getSeries(a), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index a7b4cff..858dd54 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -224,7 +224,7 @@ def main(number): 'series': getSeries(detail_page), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} From 40d25d23f5e87a21189b97627c11e92c8d877484 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 12:17:12 +0800 Subject: [PATCH 12/56] =?UTF-8?q?ADC=5Ffunction.py:=E6=8D=A2=E8=A3=85getIn?= =?UTF-8?q?stance(),load=5Fcookies()=E6=94=B9=E7=94=A8pathlib?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index b13d0b4..a11ef3b 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -2,7 +2,7 @@ from os import replace import requests import hashlib from pathlib import Path -import random +#import secrets import os.path import uuid import json @@ -24,8 +24,8 @@ G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (K # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): - verify = config.Config().cacert_file() - configProxy = config.Config().proxy() + verify = config.getInstance().cacert_file() + configProxy = config.getInstance().proxy() errors = "" if ua is None: @@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None) def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() errors = "" headers_ua = {"User-Agent": G_USER_AGENT} if headers is None: @@ -86,7 +86,7 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str 
= None): browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() result = browser.open(url) @@ -107,7 +107,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) if isinstance(cookies, dict): requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) - configProxy = config.Config().proxy() + configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() result = browser.open(url) @@ -131,7 +131,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d # def get_javlib_cookie() -> [dict, str]: # import cloudscraper -# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy() +# switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy() # proxies = get_proxy(proxy, proxytype) # # raw_cookie = {} @@ -158,7 +158,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d def translateTag_to_sc(tag): - tranlate_to_sc = config.Config().transalte_to_sc() + tranlate_to_sc = config.getInstance().transalte_to_sc() if tranlate_to_sc: dict_gen = {'中文字幕': '中文字幕', '高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头', @@ -506,7 +506,7 @@ def translate( ): trans_result = "" if engine == "google-free": - gsite = config.Config().get_translate_service_site() + gsite = config.getInstance().get_translate_service_site() if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite): gsite = 'translate.google.cn' url = ( @@ -521,7 +521,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t trans_result = trans_result.join(translate_list) # elif engine == "baidu": # url = "https://fanyi-api.baidu.com/api/trans/vip/translate" - # salt = random.randint(1, 1435660288) + # salt = secrets.randbelow(1435660287) + 1 # random.randint(1, 1435660288) # sign = app_id + src + str(salt) + key # sign = hashlib.md5(sign.encode()).hexdigest() # url += ( @@ -564,7 +564,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t def is_uncensored(number): if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper(): return True - configs = config.Config().get_uncensored() + configs = config.getInstance().get_uncensored() prefix_list = str(configs).split(',') for pre in prefix_list: if pre.upper() in number.upper(): @@ -593,20 +593,20 @@ def load_cookies(filename): filename = os.path.basename(filename) if not len(filename): return None, None - path_search_order = [ - f"./{filename}", - os.path.join(Path.home(), filename), - os.path.join(Path.home(), f".avdc/{filename}"), - os.path.join(Path.home(), f".local/share/avdc/{filename}") -] + path_search_order = ( + Path.cwd() / filename, + Path.home() / filename, + Path.home() / f".avdc/{filename}", + Path.home() / f".local/share/avdc/{filename}" + ) cookies_filename = None - for p in path_search_order: - if os.path.exists(p): - cookies_filename = os.path.abspath(p) - break - if not cookies_filename: - return None, None try: + for p in path_search_order: + if p.is_file(): + cookies_filename = str(p.resolve()) + break + if not cookies_filename: + return None, None return json.load(open(cookies_filename)), cookies_filename 
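+        # Note: the bare except below also covers the search loop above, so an
+        # unreadable candidate path or a corrupt cookies file degrades to the
+        # (None, None) "no cookies" result instead of raising into the caller;
+        # p.is_file() (unlike the old os.path.exists) also rejects directories.
+        # Equivalent lookup in one expression (a sketch):
+        #   cookies_filename = next((str(p.resolve()) for p in path_search_order if p.is_file()), None)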
except: return None, None From b87206870be7ee9a56726e864c4ca5b3092eeff5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 12:29:46 +0800 Subject: [PATCH 13/56] core.py:enhancement --- core.py | 77 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/core.py b/core.py index 3ca9eb2..264d30b 100755 --- a/core.py +++ b/core.py @@ -3,8 +3,6 @@ import os.path import pathlib import re import shutil -import platform -import errno import sys from PIL import Image @@ -33,7 +31,6 @@ def moveFailedFolder(filepath, conf): print("[-]Add to Failed List file, see '%s'" % ftxt) with open(ftxt, 'a', encoding='utf-8') as flt: flt.write(f'{filepath}\n') - flt.close() elif conf.failed_move() and not soft_link: failed_name = os.path.join(failed_folder, os.path.basename(filepath)) mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt')) @@ -41,8 +38,13 @@ def moveFailedFolder(filepath, conf): with open(mtxt, 'a', encoding='utf-8') as wwibbmt: tmstr = datetime.now().strftime("%Y-%m-%d %H:%M") wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n') - wwibbmt.close() - shutil.move(filepath, failed_name) + try: + if os.path.exists(failed_name): + print('[-]File Exists while moving to FailedFolder') + return + shutil.move(filepath, failed_name) + except: + print('[-]File Moving to FailedFolder unsuccessful!') def get_info(json_data): # 返回json里的数据 @@ -224,7 +226,6 @@ def image_download(cover, number, leak_word, c_word, path, conf: config.Config, def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf): title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) - failed_folder = conf.failed_folder() if conf.main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 nfo_path = str(Path(filepath).with_suffix('.nfo')) else: @@ -236,6 +237,10 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except: print(f"[-]Fatal error! 
can not make folder '{path}'") sys.exit(0) + + # KODI内查看影片信息时找不到number,配置naming_rule=number+'#'+title虽可解决 + # 但使得标题太长,放入时常为空的outline内会更适合,软件给outline留出的显示版面也较大 + outline = f"{number}#{outline}" with open(nfo_path, "wt", encoding='UTF-8') as code: print('<?xml version="1.0" encoding="UTF-8" ?>', file=code) print("<movie>", file=code) @@ -287,7 +292,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" <num>" + number + "</num>", file=code) print(" <premiered>" + release + "</premiered>", file=code) print(" <cover>" + cover + "</cover>", file=code) - if config.Config().is_trailer(): + if conf.is_trailer(): print(" <trailer>" + trailer + "</trailer>", file=code) print(" <website>" + website + "</website>", file=code) print("</movie>", file=code) @@ -405,22 +410,30 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config file_parent_origin_path = str(filepath_obj.parent) try: targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}") + # 任何情况下都不要覆盖,以免遭遇数据源或者引擎错误导致所有文件得到同一个number,逐一 + # 同名覆盖致使全部文件损失且不可追回的最坏情况 + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') # 如果soft_link=1 使用软链接 if conf.soft_link() == 0: shutil.move(filepath, targetpath) elif conf.soft_link() == 1: - # 采用相对路径,以便网络访问时能正确打开视频 - filerelpath = os.path.relpath(filepath, path) - os.symlink(filerelpath, targetpath) + # 先尝试采用相对路径,以便网络访问时能正确打开视频,失败则可能是因为跨盘符等原因无法支持 + # 相对路径径,改用绝对路径方式尝试建立软链接 + try: + filerelpath = os.path.relpath(filepath, path) + os.symlink(filerelpath, targetpath) + except: + os.symlink(filepath_obj.resolve(), targetpath) elif conf.soft_link() == 2: shutil.move(filepath, targetpath) # 移走文件后,在原来位置增加一个可追溯的软链接,指向文件新位置 # 以便追查文件从原先位置被移动到哪里了,避免因为得到错误番号后改名移动导致的文件失踪 - # 便于手工找回文件。并将软连接文件名后缀修改,以避免再次被搜刮。 + # 便于手工找回文件。由于目前软链接已经不会被刮削,文件名后缀无需再修改。 targetabspath = os.path.abspath(targetpath) if targetabspath != os.path.abspath(filepath): targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path) - os.symlink(targetrelpath, filepath + '#sym') + os.symlink(targetrelpath, filepath) sub_res = conf.sub_rule() for subname in sub_res: @@ -430,9 +443,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config print('[+]Sub moved!') return True - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') + moveFailedFolder(filepath, conf) return except PermissionError: print('[-]Error! 
Please run as administrator!') @@ -448,11 +461,14 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo filepath_obj = pathlib.Path(filepath) houzhui = filepath_obj.suffix file_parent_origin_path = str(filepath_obj.parent) + targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}") + if os.path.exists(targetpath): + raise FileExistsError('File Exists on destination path, we will never overwriting.') try: if conf.soft_link(): - os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")) + os.symlink(filepath, targetpath) else: - shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")) + shutil.move(filepath, targetpath) sub_res = conf.sub_rule() for subname in sub_res: @@ -462,9 +478,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo print('[+]Sub moved!') print('[!]Success') return True - except FileExistsError: - print('[-]File Exists! Please check your movie!') - print('[-]move to the root folder of the program.') + except FileExistsError as fee: + print(f'[-]FileExistsError: {fee}') return except PermissionError: print('[-]Error! Please run as administrator!') @@ -594,17 +609,18 @@ def core_main(file_path, number_th, conf: config.Config): # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) - # 打印文件 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) - - # 移动文件 - paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) - + # 添加水印 poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + # 移动电影 + paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) + elif conf.main_mode() == 2: # 创建文件夹 path = create_folder(json_data, conf) @@ -639,11 +655,12 @@ def core_main(file_path, number_th, conf: config.Config): # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) - # 打印文件 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, - tag, json_data.get('actor_list'), liuchu, uncensored, conf) - + # 添加水印 poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + + # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, + tag, json_data.get('actor_list'), liuchu, uncensored, conf) From 8ab736e4fabfee5f4022170b54743543e63db8ca Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 13:02:52 +0800 Subject: [PATCH 14/56] AV_Data_Capture.py:command params new add -m -d -c -i -g -z --- ADC_function.py | 8 -- AV_Data_Capture.py | 205 +++++++++++++++++++++++++++++---------------- 2 files changed, 135 insertions(+), 78 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index a11ef3b..e755fb5 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -622,11 +622,3 @@ def 
file_modification_days(filename) -> int: if days < 0: return 9999 return days - -# 检查文件是否是链接 -def is_link(filename: str): - if os.path.islink(filename): - return True # symlink - elif os.stat(filename).st_nlink > 1: - return True # hard link Linux MAC OSX Windows NTFS - return False diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 6ab00ad..9b75f50 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -11,7 +11,7 @@ import config from datetime import datetime, timedelta import time from pathlib import Path -from ADC_function import file_modification_days, get_html, is_link +from ADC_function import file_modification_days, get_html from number_parser import get_number from core import core_main, moveFailedFolder @@ -35,25 +35,48 @@ def check_update(local_version): def argparse_function(ver: str) -> typing.Tuple[str, str, bool]: - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + conf = config.getInstance() + parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.") parser.add_argument("file", default='', nargs='?', help="Single Movie file path.") parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.") - # parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.") - default_logdir = os.path.join(Path.home(),'.avlogs') + parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") + parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") + # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") + default_logdir = Path.home() / '.avlogs' parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?', - help=f"""Duplicate stdout and stderr to logfiles -in logging folder, default on. -default for current user: {default_logdir} -Use --log-dir= to turn off logging feature.""") - parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number") - parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true", - help="Auto exit after program complete") + help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. + default folder for current user: '{default_logdir}'. Change default folder to an empty file, + or use --log-dir= to turn log off.""") parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.") + parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.") + parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.") + parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format( + os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt'))) + parser.add_argument("-a", "--auto-exit", action="store_true", + help="Auto exit after program complete") + parser.add_argument("-g","--debug", action="store_true", + help="Turn on debug mode to generate diagnostic log for issue report.") + parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true", + help="""Only show job list of files and numbers, and **NO** actual operation +is performed. 
It may help you correct wrong numbers before real job.""") parser.add_argument("-v", "--version", action="version", version=ver) + #ini_path args = parser.parse_args() + def get_natural_number_or_none(value): + return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None + def get_str_or_none(value): + return value if isinstance(value, str) and len(value) else None + def get_bool_or_none(value): + return True if isinstance(value, bool) and value else None + config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode) + config.G_conf_override["common:source_folder"] = get_str_or_none(args.path) + config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit) + config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days) + config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt) + config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list) + config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug) - return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr - + return args.file, args.number, args.logdir, args.regexstr, args.zero_op class OutLogger(object): def __init__(self, logfile) -> None: @@ -200,15 +223,14 @@ def close_logfile(logdir: str): # 100MB的日志文件能缩小到3.7MB。 -# 重写视频文件扫描,消除递归,取消全局变量,新增失败文件列表跳过处理 -def movie_lists(root, conf, regexstr): - escape_folder = re.split("[,,]", conf.escape_folder()) +# 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 +def movie_lists(source_folder, regexstr): + conf = config.getInstance() main_mode = conf.main_mode() debug = conf.debug() nfo_skip_days = conf.nfo_skip_days() soft_link = conf.soft_link() - total = [] - file_type = conf.media_type().upper().split(",") + file_type = conf.media_type().lower().split(",") trailerRE = re.compile(r'-trailer\.', re.IGNORECASE) cliRE = None if isinstance(regexstr, str) and len(regexstr): @@ -216,61 +238,85 @@ def movie_lists(root, conf, regexstr): cliRE = re.compile(regexstr, re.IGNORECASE) except: pass + failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt' failed_set = set() - if main_mode == 3 or soft_link: + if (main_mode == 3 or soft_link) and not conf.ignore_failed_list(): try: - with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt: + with open(failed_list_txt_path, 'r', encoding='utf-8') as flt: flist = flt.read().splitlines() failed_set = set(flist) - flt.close() if len(flist) != len(failed_set): - with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt: - flt.writelines([line + '\n' for line in failed_set]) - flt.close() + with open(failed_list_txt_path, 'w', encoding='utf-8') as flt: + wtlines = [line + '\n' for line in failed_set] + wtlines.sort() + flt.writelines(wtlines) except: pass - for current_dir, subdirs, files in os.walk(root, topdown=False): - if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0: + if not Path(source_folder).is_dir(): + print('[-]Source folder not found!') + return [] + total = [] + source = Path(source_folder).resolve() + skip_failed_cnt, skip_nfo_days_cnt = 0, 0 + escape_folder_set = set(re.split("[,,]", conf.escape_folder())) + for full_name in source.glob(r'**/*'): + if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set: continue - for f in files: - full_name = os.path.join(current_dir, f) - if not 
os.path.splitext(full_name)[1].upper() in file_type:
-                continue
-            absf = os.path.abspath(full_name)
-            if absf in failed_set:
-                if debug:
-                    print('[!]Skip failed file:', absf)
-                continue
-            if cliRE and not cliRE.search(absf):
-                continue
-            if main_mode == 3 and nfo_skip_days > 0:
-                nfo = Path(absf).with_suffix('.nfo')
-                if file_modification_days(nfo) <= nfo_skip_days:
-                    continue
-            if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
-                total.append(absf)
+        if not full_name.suffix.lower() in file_type:
+            continue
+        absf = str(full_name)
+        if absf in failed_set:
+            skip_failed_cnt += 1
+            if debug:
+                print('[!]Skip failed movie:', absf)
+            continue
+        is_sym = full_name.is_symlink()
+        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标
+            continue # file is symlink or hardlink(Linux/NTFS/Darwin)
+        # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB)
+        movie_size = 0 if is_sym else full_name.stat().st_size # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测
+        if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
+            continue
+        if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
+            continue
+        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+            skip_nfo_days_cnt += 1
+            if debug:
+                print(f"[!]Skip movie by its .nfo, which was modified within {nfo_skip_days} days: '{absf}'")
+            continue
+        total.append(absf)
+
+    if skip_failed_cnt:
+        print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
+    if skip_nfo_days_cnt:
+        print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' whose .nfo was modified within {nfo_skip_days} days.")
     if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
         return total
     # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的
     skip_numbers = set()
-    success_folder = conf.success_folder()
-    for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
-        for f in files:
-            f_obj = Path(f)
-            if f_obj.suffix.lower() != '.nfo':
-                continue
-            if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
-                continue
-            number = get_number(False, f_obj.stem)
-            if number:
-                skip_numbers.add(number.upper())
+    success_folder = Path(conf.success_folder()).resolve()
+    for f in success_folder.glob(r'**/*'):
+        if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+            continue
+        if file_modification_days(f) > nfo_skip_days:
+            continue
+        number = get_number(False, f.stem)
+        if not number:
+            continue
+        skip_numbers.add(number.lower())
+
     rm_list = []
     for f in total:
         n_number = get_number(False, os.path.basename(f))
-        if n_number and n_number.upper() in skip_numbers:
+        if n_number and n_number.lower() in skip_numbers:
             rm_list.append(f)
     for f in rm_list:
         total.remove(f)
+        if debug:
+            print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
+    if len(rm_list):
+        print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' whose .nfo was modified within {nfo_skip_days} days.")
+
     return total
 
 
@@ -299,14 +345,18 @@ def rm_empty_folder(path):
         pass
 
 
-def create_data_and_move(file_path: str, c: config.Config, debug):
+def create_data_and_move(file_path: str, zero_op):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
+    c = config.getInstance()
+    debug = c.debug()
     file_name = os.path.basename(file_path)
     n_number = get_number(debug, file_name)
    file_path = os.path.abspath(file_path)
 
     if debug == True:
         print(f"[!] 
[{n_number}] As Number making data for '{file_path}'") + if zero_op: + return if n_number: core_main(file_path, n_number, c) else: @@ -315,6 +365,8 @@ def create_data_and_move(file_path: str, c: config.Config, debug): else: try: print(f"[!] [{n_number}] As Number making data for '{file_path}'") + if zero_op: + return if n_number: core_main(file_path, n_number, c) else: @@ -357,8 +409,17 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu if __name__ == '__main__': version = '5.0.1' urllib3.disable_warnings() #Ignore http proxy warning + + # Read config.ini first, in argparse_function() need conf.failed_folder() + conf = config.Config("config.ini") + # Parse command line args - single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version) + single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version) + + main_mode = conf.main_mode() + if not main_mode in (1, 2, 3): + print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") + sys.exit(4) dupe_stdout_to_logfile(logdir) @@ -368,9 +429,8 @@ if __name__ == '__main__': print('[*]======================================================') print('[*]严禁在墙内宣传本项目') - # Read config.ini - conf = config.Config("config.ini") - + start_time = time.time() + print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S")) if conf.update_check(): check_update(version) @@ -382,9 +442,15 @@ if __name__ == '__main__': print('[!]Enable soft link') if len(sys.argv)>1: print('[!]CmdLine:'," ".join(sys.argv[1:])) + print('[+]Main Working mode ## {}: {} ## {}{}{}' + .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1], + "" if not conf.multi_threading() else ", multi_threading on", + "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}", + "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}" + ) if not single_file_path else ('-','Single File', '','','')) + ) create_failed_folder(conf.failed_folder()) - start_time = time.time() if not single_file_path == '': #Single File print('[+]==================== Single File =====================') @@ -393,32 +459,31 @@ if __name__ == '__main__': else: create_data_and_move_with_custom_number(single_file_path, conf, custom_number) else: - if folder_path == '': + folder_path = conf.source_folder() + if not isinstance(folder_path, str) or folder_path == '': folder_path = os.path.abspath(".") - movie_list = movie_lists(folder_path, conf, regexstr) + movie_list = movie_lists(folder_path, regexstr) count = 0 count_all = str(len(movie_list)) - print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S")) - main_mode = conf.main_mode() + print('[+]Find', count_all, 'movies.') stop_count = conf.stop_counter() if stop_count<1: stop_count = 999999 else: count_all = str(min(len(movie_list), stop_count)) - if main_mode == 3: - print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。') + for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' print('[!] 
{:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) - create_data_and_move(movie_path, conf, conf.debug()) + create_data_and_move(movie_path, zero_op) if count >= stop_count: print("[!]Stop counter triggered!") break - if conf.del_empty_folder(): + if conf.del_empty_folder() and not zero_op: rm_empty_folder(conf.success_folder()) rm_empty_folder(conf.failed_folder()) if len(folder_path): @@ -433,7 +498,7 @@ if __name__ == '__main__': close_logfile(logdir) - if not (conf.auto_exit() or auto_exit): + if not conf.auto_exit(): input("Press enter key exit, you can check the error message before you exit...") sys.exit(0) From 35c4bf85ae795785dd56490cfe78d979c2f2449a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 8 Oct 2021 16:01:31 +0800 Subject: [PATCH 15/56] argparse:need str as default value type --- AV_Data_Capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 9b75f50..4411538 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -42,7 +42,7 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, bool]: parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") - default_logdir = Path.home() / '.avlogs' + default_logdir = str(Path.home() / '.avlogs') parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?', help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. default folder for current user: '{default_logdir}'. 
Change default folder to an empty file, From 288acfb264c7ad1d1ffd3fe1d96b4071ae65b836 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 05:28:44 +0800 Subject: [PATCH 16/56] =?UTF-8?q?=E4=B8=8D=E4=BC=9A=E9=80=A0=E6=88=90bug?= =?UTF-8?q?=EF=BC=8C=E4=BD=86=E8=BF=98=E6=98=AF=E6=94=B9=E4=B8=80=E4=B8=8B?= =?UTF-8?q?=E5=A5=BD=E4=B8=80=E4=BA=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 4411538..5def067 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -177,12 +177,12 @@ def close_logfile(logdir: str): # rm -rf $LOGDIR """ # 第一步,合并到月 + today = datetime.today() for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] if not txts or not len(txts): break txts.sort() - today = datetime.today() tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") deadline_month = f'avdc_{tmstr_3_month_ago}' month_merge = [f for f in txts if f.stem < deadline_month] From 890452bffd9a978f0dc067428cc1d6b6bdd44ab3 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 09:07:38 +0800 Subject: [PATCH 17/56] =?UTF-8?q?=E8=A1=A5=E4=B8=8A=E6=BC=8F=E6=8E=89?= =?UTF-8?q?=E6=B2=A1=E6=9B=B4=E6=96=B0=E7=9A=84config=E6=89=93=E5=8C=85?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E9=83=A8=E5=88=86=EF=BC=8C=E5=85=88=E5=89=8D?= =?UTF-8?q?=E8=A2=AB=E6=88=91=E7=9A=84WinMerge=20filter=E8=A7=84=E5=88=99?= =?UTF-8?q?=E8=BF=87=E6=BB=A4=E6=8E=89=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/main.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6b0a748..289c88e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,7 +33,7 @@ jobs: - name: Test number_perser.get_number run: | python number_parser.py -v - + - name: Build with PyInstaller for macos/ubuntu if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' run: | @@ -42,6 +42,8 @@ jobs: --hidden-import ADC_function.py \ --hidden-import core.py \ --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "Img:Img" \ + --add-data "config.ini:." \ - name: Build with PyInstaller for windows if: matrix.os == 'windows-latest' @@ -51,6 +53,8 @@ jobs: --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" ` + --add-data "Img;Img" ` + --add-data "config.ini;." 
` - name: Copy config.ini run: | From f60166922984c1e9f4cbc608ca43c8ba52e30d77 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 12:23:00 +0800 Subject: [PATCH 18/56] javdb:change to site 31 and 32 --- WebCrawler/javdb.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 756be1c..3a0a18d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -213,14 +213,16 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = secrets.choice(["javdb9", "javdb30"]) + javdb_site = secrets.choice(["javdb31", "javdb32"]) + if config.getInstance().debug(): + print(f'[!]javdb:select site {javdb_site}') try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass # else: # number = number.upper() number = number.upper() - cookie_json = './' + javdb_site + '.json' + cookie_json = javdb_site + '.json' javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 cookies_dict, cookies_filepath = load_cookies(cookie_json) @@ -326,6 +328,7 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") if __name__ == "__main__": + config.G_conf_override['debug_mode:switch'] = True # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) From bd3504f3b5ed200ab79e2783f7594ee35c4ffb53 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:32:00 +0800 Subject: [PATCH 19/56] javdb:only accept one login site after javdb site update --- WebCrawler/javdb.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 3a0a18d..841d8d6 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -213,27 +213,31 @@ def getSeries(a): return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = secrets.choice(["javdb31", "javdb32"]) - if config.getInstance().debug(): - print(f'[!]javdb:select site {javdb_site}') + # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, + # 如果无.json文件则按选择最后一个站点。 + javdb_sites = ["javdb31", "javdb32"] + debug = config.getInstance().debug() try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass # else: # number = number.upper() number = number.upper() - cookie_json = javdb_site + '.json' javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 - cookies_dict, cookies_filepath = load_cookies(cookie_json) - if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): - cdays = file_modification_days(cookies_filepath) - if cdays < 7: - javdb_cookies = cookies_dict - elif cdays != 9999: - print( -f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - + for cj in javdb_sites: + javdb_site = cj + cookie_json = javdb_site + '.json' + cookies_dict, cookies_filepath = load_cookies(cookie_json) + if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): + cdays = file_modification_days(cookies_filepath) + if cdays < 7: + javdb_cookies = cookies_dict + elif cdays != 9999: + print(f'[!]Cookies file {cookies_filepath} was updated 
{cdays} days ago, it will not be used for HTTP requests.') + break + if debug: + print(f'[!]javdb:select site {javdb_site}') try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' query_result = get_html(javdb_url, cookies=javdb_cookies) @@ -338,3 +342,4 @@ if __name__ == "__main__": # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found + print(main('FC2-2314275')) From 3873d1aa4cabb0eb690b2cd6f50a7f1eb181c07b Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:37:40 +0800 Subject: [PATCH 20/56] update user agent --- ADC_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ADC_function.py b/ADC_function.py index e755fb5..09fb11d 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -20,7 +20,7 @@ def getXpathSingle(htmlcode, xpath): return result1 -G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36' +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36' # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): From d010ea6d517e74895c40543fbf4decd05dad7f2d Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 19:42:11 +0800 Subject: [PATCH 21/56] =?UTF-8?q?=E6=B8=85=E7=90=86=E5=85=A8=E9=83=A8conf?= =?UTF-8?q?=E7=A9=BF=E6=A2=AD=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 23 ++++--- WebCrawler/__init__.py | 3 +- core.py | 139 +++++++++++++++++++++-------------------- 3 files changed, 85 insertions(+), 80 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 5def067..8e1cb76 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -347,10 +347,8 @@ def rm_empty_folder(path): def create_data_and_move(file_path: str, zero_op): # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 - c = config.getInstance() - debug = c.debug() - file_name = os.path.basename(file_path) - n_number = get_number(debug, file_name) + debug = config.getInstance().debug() + n_number = get_number(debug, os.path.basename(file_path)) file_path = os.path.abspath(file_path) if debug == True: @@ -358,7 +356,7 @@ def create_data_and_move(file_path: str, zero_op): if zero_op: return if n_number: - core_main(file_path, n_number, c) + core_main(file_path, n_number) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -368,7 +366,7 @@ def create_data_and_move(file_path: str, zero_op): if zero_op: return if n_number: - core_main(file_path, n_number, c) + core_main(file_path, n_number) else: raise ValueError("number empty") print("[*]======================================================") @@ -377,17 +375,18 @@ def create_data_and_move(file_path: str, zero_op): print('[-]', err) try: - moveFailedFolder(file_path, conf) + moveFailedFolder(file_path) except Exception as err: print('[!]', err) -def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number): +def create_data_and_move_with_custom_number(file_path: str, custom_number): + conf = config.getInstance() file_name = os.path.basename(file_path) try: print("[!] 
[{1}] As Number making data for '{0}'".format(file_path, custom_number)) if custom_number: - core_main(file_path, custom_number, c) + core_main(file_path, custom_number) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -395,7 +394,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu print("[-] [{}] ERROR:".format(file_path)) print('[-]', err) - if c.soft_link(): + if conf.soft_link(): print("[-]Link {} to failed folder".format(file_path)) os.symlink(file_path, os.path.join(conf.failed_folder(), file_name)) else: @@ -455,9 +454,9 @@ if __name__ == '__main__': if not single_file_path == '': #Single File print('[+]==================== Single File =====================') if custom_number == '': - create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path))) + create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path))) else: - create_data_and_move_with_custom_number(single_file_path, conf, custom_number) + create_data_and_move_with_custom_number(single_file_path, custom_number) else: folder_path = conf.source_folder() if not isinstance(folder_path, str) or folder_path == '': diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index dc54b46..c5d02b5 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测 return True -def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数据 +def get_data_from_json(file_number): # 从JSON返回元数据 """ iterate through all services and fetch the data """ @@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数 "fc2club": fc2club.main } + conf = config.getInstance() # default fetch order list, from the beginning to the end sources = conf.sources().split(',') if not len(conf.sources()) > 80: diff --git a/core.py b/core.py index 264d30b..94a8503 100755 --- a/core.py +++ b/core.py @@ -21,7 +21,8 @@ def escape_path(path, escape_literals: str): # Remove escape literals return path -def moveFailedFolder(filepath, conf): +def moveFailedFolder(filepath): + conf = config.getInstance() failed_folder = conf.failed_folder() soft_link = conf.soft_link() # 模式3或软连接,改为维护一个失败列表,启动扫描时加载用于排除该路径,以免反复处理 @@ -65,14 +66,15 @@ def get_info(json_data): # 返回json里的数据 return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label -def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath): +def small_cover_check(path, number, cover_small, leak_word, c_word, filepath): filename = f"{number}{leak_word}{c_word}-poster.jpg" - download_file_with_filename(cover_small, filename, path, conf, filepath) + download_file_with_filename(cover_small, filename, path, filepath) print('[+]Image Downloaded! 
' + os.path.join(path, filename)) -def create_folder(json_data, conf: config.Config): # 创建文件夹 +def create_folder(json_data): # 创建文件夹 title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) + conf = config.getInstance() success_folder = conf.success_folder() actor = json_data.get('actor') location_rule = eval(conf.location_rule(), json_data) @@ -104,7 +106,8 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 # =====================资源下载部分=========================== # path = examle:photo , video.in the Project Folder! -def download_file_with_filename(url, filename, path, conf: config.Config, filepath): +def download_file_with_filename(url, filename, path, filepath): + conf = config.getInstance() configProxy = conf.proxy() for i in range(configProxy.retry): @@ -156,20 +159,20 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry)) except IOError: print(f"[-]Create Directory '{path}' failed!") - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return print('[-]Connect Failed! Please check your Proxy or Network!') - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return -def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config): - if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed': +def trailer_download(trailer, leak_word, c_word, number, path, filepath): + if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed': return - configProxy = conf.proxy() + configProxy = config.getInstance().proxy() for i in range(configProxy.retry): if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0: print('[!]Video Download Failed! Trying again. [{}/3]', i + 1) - download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) + download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) continue else: break @@ -178,20 +181,20 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: c print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4') # 剧照下载成功,否则移动到failed -def extrafanart_download(data, path, conf: config.Config, filepath): +def extrafanart_download(data, path, filepath): j = 1 - path = os.path.join(path, conf.get_extrafanart()) + path = os.path.join(path, config.getInstance().get_extrafanart()) + configProxy = config.getInstance().proxy() for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) - if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed': - moveFailedFolder(filepath, conf) + if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': + moveFailedFolder(filepath) return - configProxy = conf.proxy() for i in range(configProxy.retry): if os.path.getsize(jpg_fullpath) == 0: print('[!]Image Download Failed! Trying again. 
[{}/3]', i + 1) - download_file_with_filename(url, jpg_filename, path, conf, filepath) + download_file_with_filename(url, jpg_filename, path, filepath) continue else: break @@ -203,18 +206,18 @@ def extrafanart_download(data, path, conf: config.Config, filepath): # 封面是否下载成功,否则移动到failed -def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath): +def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) - if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed': - moveFailedFolder(filepath, conf) + if download_file_with_filename(cover, filename, path, filepath) == 'failed': + moveFailedFolder(filepath) return - configProxy = conf.proxy() + configProxy = config.getInstance().proxy() for i in range(configProxy.retry): if os.path.getsize(full_filepath) == 0: print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) - download_file_with_filename(cover, filename, path, conf, filepath) + download_file_with_filename(cover, filename, path, filepath) continue else: break @@ -224,9 +227,9 @@ def image_download(cover, number, leak_word, c_word, path, conf: config.Config, shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")) -def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf): +def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored): title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) - if conf.main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 + if config.getInstance().main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 nfo_path = str(Path(filepath).with_suffix('.nfo')) else: nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo") @@ -292,7 +295,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" <num>" + number + "</num>", file=code) print(" <premiered>" + release + "</premiered>", file=code) print(" <cover>" + cover + "</cover>", file=code) - if conf.is_trailer(): + if config.getInstance().is_trailer(): print(" <trailer>" + trailer + "</trailer>", file=code) print(" <website>" + website + "</website>", file=code) print("</movie>", file=code) @@ -300,12 +303,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except IOError as e: print("[-]Write Failed!") print("[-]", e) - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return except Exception as e1: print("[-]Write Failed!") print("[-]", e1) - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return @@ -334,7 +337,7 @@ def cutImage(imagecut, path, number, leak_word, c_word): # leak 流出 参数值为 1 0 # uncensored 无码 参数值为 1 0 # ========================================================================加水印 -def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config): +def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored): mark_type = '' if cn_sub: mark_type += ',字幕' @@ -344,17 +347,17 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Conf mark_type += ',无码' if mark_type == '': return - add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf) + add_mark_thread(thumb_path, cn_sub, leak, 
uncensored) print('[+]Thumb Add Mark: ' + mark_type.strip(',')) - add_mark_thread(poster_path, cn_sub, leak, uncensored, conf) + add_mark_thread(poster_path, cn_sub, leak, uncensored) print('[+]Poster Add Mark: ' + mark_type.strip(',')) -def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf): +def add_mark_thread(pic_path, cn_sub, leak, uncensored): size = 14 img_pic = Image.open(pic_path) # 获取自定义位置,取余配合pos达到顺时针添加的效果 # 左上 0, 右上 1, 右下 2, 左下 3 - count = conf.watermark_type() + count = config.getInstance().watermark_type() if cn_sub == 1 or cn_sub == '1': add_to_pic(pic_path, img_pic, size, count, 1) # 添加 count = (count + 1) % 4 @@ -404,7 +407,7 @@ def add_to_pic(pic_path, img_pic, size, count, mode): img_pic.save(pic_path, quality=95) # ========================结束================================= -def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config): # 文件路径,番号,后缀,要移动至的位置 +def paste_file_to_folder(filepath, path, number, leak_word, c_word): # 文件路径,番号,后缀,要移动至的位置 filepath_obj = pathlib.Path(filepath) houzhui = filepath_obj.suffix file_parent_origin_path = str(filepath_obj.parent) @@ -414,10 +417,11 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config # 同名覆盖致使全部文件损失且不可追回的最坏情况 if os.path.exists(targetpath): raise FileExistsError('File Exists on destination path, we will never overwriting.') + soft_link = config.getInstance().soft_link() # 如果soft_link=1 使用软链接 - if conf.soft_link() == 0: + if soft_link == 0: shutil.move(filepath, targetpath) - elif conf.soft_link() == 1: + elif soft_link == 1: # 先尝试采用相对路径,以便网络访问时能正确打开视频,失败则可能是因为跨盘符等原因无法支持 # 相对路径径,改用绝对路径方式尝试建立软链接 try: @@ -425,7 +429,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config os.symlink(filerelpath, targetpath) except: os.symlink(filepath_obj.resolve(), targetpath) - elif conf.soft_link() == 2: + elif soft_link == 2: shutil.move(filepath, targetpath) # 移走文件后,在原来位置增加一个可追溯的软链接,指向文件新位置 # 以便追查文件从原先位置被移动到哪里了,避免因为得到错误番号后改名移动导致的文件失踪 @@ -434,7 +438,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config if targetabspath != os.path.abspath(filepath): targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path) os.symlink(targetrelpath, filepath) - sub_res = conf.sub_rule() + sub_res = config.getInstance().sub_rule() for subname in sub_res: sub_filepath = str(filepath_obj.with_suffix(subname)) @@ -445,7 +449,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config except FileExistsError as fee: print(f'[-]FileExistsError: {fee}') - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return except PermissionError: print('[-]Error! 
Please run as administrator!') @@ -455,7 +459,7 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config return -def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf): # 文件路径,番号,后缀,要移动至的位置 +def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word): # 文件路径,番号,后缀,要移动至的位置 if multi_part == 1: number += part # 这时number会被附加上CD1后缀 filepath_obj = pathlib.Path(filepath) @@ -465,12 +469,12 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo if os.path.exists(targetpath): raise FileExistsError('File Exists on destination path, we will never overwriting.') try: - if conf.soft_link(): + if config.getInstance().soft_link(): os.symlink(filepath, targetpath) else: shutil.move(filepath, targetpath) - sub_res = conf.sub_rule() + sub_res = config.getInstance().sub_rule() for subname in sub_res: sub_filepath = str(filepath_obj.with_suffix(subname)) if os.path.isfile(sub_filepath): # 字幕移动 @@ -488,7 +492,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo print(f'[-]OS Error errno {oserr.errno}') return -def get_part(filepath, conf): +def get_part(filepath): try: if re.search('-CD\d+', filepath): return re.findall('-CD\d+', filepath)[0] @@ -496,7 +500,7 @@ def get_part(filepath, conf): return re.findall('-cd\d+', filepath)[0] except: print("[-]failed!Please rename the filename again!") - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return @@ -516,7 +520,8 @@ def debug_print(data: json): pass -def core_main(file_path, number_th, conf: config.Config): +def core_main(file_path, number_th): + conf = config.getInstance() # =======================================================================初始化所需变量 multi_part = 0 part = '' @@ -530,11 +535,11 @@ def core_main(file_path, number_th, conf: config.Config): # 下面被注释的变量不需要 #rootpath= os.getcwd number = number_th - json_data = get_data_from_json(number, conf) # 定义番号 + json_data = get_data_from_json(number) # 定义番号 # Return if blank dict returned (data not found) if not json_data: - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return if json_data["number"] != number: @@ -549,7 +554,7 @@ def core_main(file_path, number_th, conf: config.Config): # =======================================================================判断-C,-CD后缀 if '-CD' in filepath or '-cd' in filepath: multi_part = 1 - part = get_part(filepath, conf) + part = get_part(filepath) if '-c.' in filepath or '-C.' 
in filepath or '中文' in filepath or '字幕' in filepath: cn_sub = '1' c_word = '-C' # 中文字幕影片后缀 @@ -573,7 +578,7 @@ def core_main(file_path, number_th, conf: config.Config): debug_print(json_data) # 创建文件夹 - #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf) + #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data) # main_mode # 1: 刮削模式 / Scraping mode @@ -581,28 +586,28 @@ def core_main(file_path, number_th, conf: config.Config): # 3:不改变路径刮削 if conf.main_mode() == 1: # 创建文件夹 - path = create_folder(json_data, conf) + path = create_folder(json_data) if multi_part == 1: number += part # 这时number会被附加上CD1后缀 # 检查小封面, 如果image cut为3,则下载小封面 if imagecut == 3: - small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath) + small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath) # creatFolder会返回番号路径 - image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath) + image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath) if not multi_part or part.lower() == '-cd1': try: # 下载预告片 if conf.is_trailer() and json_data.get('trailer'): - trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf) + trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath) except: pass try: - # 下载剧照 data, path, conf: config.Config, filepath + # 下载剧照 data, path, filepath if conf.is_extrafanart() and json_data.get('extrafanart'): - extrafanart_download(json_data.get('extrafanart'), path, conf, filepath) + extrafanart_download(json_data.get('extrafanart'), path, filepath) except: pass @@ -613,23 +618,23 @@ def core_main(file_path, number_th, conf: config.Config): poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) # 移动电影 - paste_file_to_folder(filepath, path, number, leak_word, c_word, conf) + paste_file_to_folder(filepath, path, number, leak_word, c_word) # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 - print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf) + print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored) elif conf.main_mode() == 2: # 创建文件夹 - path = create_folder(json_data, conf) + path = create_folder(json_data) # 移动文件 - paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf) + paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word) poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) elif conf.main_mode() == 3: path = str(Path(file_path).parent) @@ -638,19 +643,19 @@ def core_main(file_path, number_th, conf: config.Config): # 检查小封面, 如果image cut为3,则下载小封面 if imagecut == 3: - small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath) + 
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath) # creatFolder会返回番号路径 - image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath) + image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath) if not multi_part or part.lower() == '-cd1': # 下载预告片 if conf.is_trailer() and json_data.get('trailer'): - trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf) + trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath) - # 下载剧照 data, path, conf: config.Config, filepath + # 下载剧照 data, path, filepath if conf.is_extrafanart() and json_data.get('extrafanart'): - extrafanart_download(json_data.get('extrafanart'), path, conf, filepath) + extrafanart_download(json_data.get('extrafanart'), path, filepath) # 裁剪图 cutImage(imagecut, path, number, leak_word, c_word) @@ -659,8 +664,8 @@ def core_main(file_path, number_th, conf: config.Config): poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg") thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg") if conf.is_watermark(): - add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf) + add_mark(poster_path, thumb_path, cn_sub, leak, uncensored) # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, - tag, json_data.get('actor_list'), liuchu, uncensored, conf) + tag, json_data.get('actor_list'), liuchu, uncensored) From b0959d1b18f931b052f3c3067fe13e578ff75d9e Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sat, 9 Oct 2021 20:29:17 +0800 Subject: [PATCH 22/56] =?UTF-8?q?javdb:=E6=97=A0=E6=9C=89=E6=95=88?= =?UTF-8?q?=E6=9C=9F=E5=86=85cookies=E6=96=87=E4=BB=B6=E6=97=B6=EF=BC=8C?= =?UTF-8?q?=E9=9A=8F=E6=9C=BA=E9=80=89=E6=8B=A9=E4=B8=80=E4=B8=AA=E7=AB=99?= =?UTF-8?q?=E7=82=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javdb.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 841d8d6..7d69404 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -214,7 +214,7 @@ def getSeries(a): def main(number): # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, - # 如果无.json文件则按选择最后一个站点。 + # 如果无.json文件或者超过有效期,则随机选择一个站点。 javdb_sites = ["javdb31", "javdb32"] debug = config.getInstance().debug() try: @@ -225,6 +225,7 @@ def main(number): number = number.upper() javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 + has_json = False for cj in javdb_sites: javdb_site = cj cookie_json = javdb_site + '.json' @@ -233,9 +234,12 @@ def main(number): cdays = file_modification_days(cookies_filepath) if cdays < 7: javdb_cookies = cookies_dict + has_json = True + break elif cdays != 9999: print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - break + if not has_json: + javdb_site = secrets.choice(javdb_sites) if debug: print(f'[!]javdb:select site {javdb_site}') try: From 0933e87944afabc1cdb18c26b272a60fa4554d33 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 10 Oct 2021 17:41:33 +0800 Subject: [PATCH 23/56] fix outline of javbus and javdb, which broke when airav went down --- ADC_function.py | 4 ++-- WebCrawler/javbus.py | 21 +++++++++++++++++++-- WebCrawler/javdb.py | 6 +++++- 3 files
changed, 26 insertions(+), 5 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 09fb11d..4480852 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -103,7 +103,7 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: return result.text -def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): +def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) if isinstance(cookies, dict): requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) @@ -113,7 +113,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d result = browser.open(url) if not result.ok: return '' - form = browser.select_form() if form_name is None else browser.select_form(form_name) + form = browser.select_form() if form_select is None else browser.select_form(form_select) if isinstance(fields, dict): for k, v in fields.items(): browser[k] = v diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 1af4359..c2ff11e 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -80,7 +80,7 @@ def getCID(htmlcode): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 try: @@ -91,6 +91,23 @@ def getOutline(number): #获取剧情介绍 except: pass return '' +def getOutline(number): #获取剧情介绍 从avno1.cc取得 + try: + number_up = number.upper() + result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except: + pass + return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -198,7 +215,7 @@ def main(number): return js if __name__ == "__main__" : - print(main('ADV-R0624')) # 404 + #print(main('ADV-R0624')) # 404 print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 7d69404..358682d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -196,7 +196,7 @@ def getDirector(a): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 try: htmlcode = get_html('https://cn.airav.wiki/video/' + number) from WebCrawler.airav import getOutline as airav_getOutline @@ -205,6 +205,9 @@ def getOutline(number): #获取剧情介绍 except: pass return '' 
+def getOutline(number): #获取剧情介绍 + from WebCrawler.javbus import getOutline as javbus_getOutline + return javbus_getOutline(number) def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -340,6 +343,7 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) + print(main('070116-197')) print(main('093021_539')) # 没有剧照 片商pacopacomama # print(main('FC2-2278260')) # print(main('FC2-735670')) From e5abac9138ee630d4cf978d7644a391fd9677e77 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 10 Oct 2021 18:02:53 +0800 Subject: [PATCH 24/56] add download_only_missing_image config item --- config.ini | 1 + config.py | 3 +++ core.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/config.ini b/config.ini index f33a578..06eda0c 100755 --- a/config.ini +++ b/config.ini @@ -18,6 +18,7 @@ nfo_skip_days=30 stop_counter=0 ; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 ignore_failed_list=0 +download_only_missing_images=1 [proxy] ;proxytype: http or socks5 or socks5h switch: 0 1 diff --git a/config.py b/config.py index 2b49ca0..3b325d9 100644 --- a/config.py +++ b/config.py @@ -141,6 +141,8 @@ class Config: return 0 def ignore_failed_list(self) -> bool: return self.getboolean_override("common", "ignore_failed_list") + def download_only_missing_images(self) -> bool: + return self.conf.getboolean("common", "download_only_missing_images") def is_transalte(self) -> bool: return self.conf.getboolean("transalte", "switch") def is_trailer(self) -> bool: @@ -264,6 +266,7 @@ class Config: conf.set(sec1, "nfo_skip_days", 30) conf.set(sec1, "stop_counter", 0) conf.set(sec1, "ignore_failed_list", 0) + conf.set(sec1, "download_only_missing_images", 1) sec2 = "proxy" conf.add_section(sec2) diff --git a/core.py b/core.py index 94a8503..6a8af37 100755 --- a/core.py +++ b/core.py @@ -183,11 +183,15 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath): # 剧照下载成功,否则移动到failed def extrafanart_download(data, path, filepath): j = 1 - path = os.path.join(path, config.getInstance().get_extrafanart()) - configProxy = config.getInstance().proxy() + conf = config.getInstance() + path = os.path.join(path, conf.get_extrafanart()) + configProxy = conf.proxy() + download_only_missing_images = conf.download_only_missing_images() for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) + if download_only_missing_images and os.path.isfile(jpg_fullpath) and os.path.getsize(jpg_fullpath): + continue if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': moveFailedFolder(filepath) return @@ -209,6 +213,8 @@ def extrafanart_download(data, path, filepath): def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) + if config.getInstance().download_only_missing_images() and os.path.isfile(full_filepath) and os.path.getsize(full_filepath): + return if download_file_with_filename(cover, filename, path, filepath) == 'failed': moveFailedFolder(filepath) return From 678a8f9bc817c32b46e551a22ee886ebed2f42a8 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 11 Oct 2021 10:24:46 +0800 Subject: [PATCH 25/56] Add signal handler --- AV_Data_Capture.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git 
a/AV_Data_Capture.py b/AV_Data_Capture.py index 8e1cb76..02ac84b 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -6,6 +6,7 @@ import sys import shutil import typing import urllib3 +import signal import config from datetime import datetime, timedelta @@ -223,6 +224,15 @@ def close_logfile(logdir: str): # 100MB的日志文件能缩小到3.7MB。 +def signal_handler(*args): + print('[!]Ctrl+C detected, Exit.') + sys.exit(9) + +def sigdebug_handler(*args): + config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"] + print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF')) + + # 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 def movie_lists(source_folder, regexstr): conf = config.getInstance() @@ -420,6 +430,11 @@ if __name__ == '__main__': print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") sys.exit(4) + signal.signal(signal.SIGINT, signal_handler) + if sys.platform == 'win32': + signal.signal(signal.SIGBREAK, sigdebug_handler) + else: + signal.signal(signal.SIGWINCH, sigdebug_handler) dupe_stdout_to_logfile(logdir) print('[*]================== AV Data Capture ===================') From f8dc05a38bad656a5d5ed186ea84ad0cce2ebc43 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:28:17 +0800 Subject: [PATCH 26/56] improve javbus and javdb outline source --- ADC_function.py | 2 +- WebCrawler/javbus.py | 12 +++++++++++- WebCrawler/javdb.py | 1 - WebCrawler/xcity.py | 13 ++++++++----- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 4480852..ed428bd 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -2,7 +2,7 @@ from os import replace import requests import hashlib from pathlib import Path -#import secrets +import secrets import os.path import uuid import json diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index c2ff11e..e739424 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -93,8 +93,12 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number): #获取剧情介绍 从avno1.cc取得 try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 number_up = number.upper() - result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + result, browser = get_html_by_form(url, form_select='div.wrapper > div.header > div.search > form', fields = {'kw' : number_up}, return_type = 'browser') @@ -107,6 +111,12 @@ def getOutline(number): #获取剧情介绍 从avno1.cc取得 return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() except: pass + from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline + try: + detail_html, browser = open_by_browser(number_up) + return xcity_getOutline(detail_html) + except: + pass return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 358682d..4b0d4c9 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -5,7 +5,6 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -import secrets # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 
'replace', line_buffering = True) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index 858dd54..4bbdec1 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -181,11 +181,10 @@ def getExtrafanart(htmlcode): # 获取剧照 return s return '' -def main(number): - try: +def open_by_browser(number): xcity_number = number.replace('-','') query_result, browser = get_html_by_form( - 'https://xcity.jp/about/', + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, return_type = 'browser') if not query_result or not query_result.ok: @@ -193,12 +192,16 @@ def main(number): result = browser.follow_link(browser.links('avod\/detail')[0]) if not result.ok: raise ValueError("xcity.py: detail page not found") - detail_page = str(browser.page) + return str(browser.page), browser + +def main(number): + try: + detail_page, browser = open_by_browser(number) url = browser.url newnum = getNum(detail_page).upper() number_up = number.upper() if newnum != number_up: - if newnum == xcity_number.upper(): + if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found") From c0a4ce638c0bb86de8506d03926ea1cb82361833 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:29:53 +0800 Subject: [PATCH 27/56] call moveFailedFolder when empty number on debug branch --- AV_Data_Capture.py | 1 + 1 file changed, 1 insertion(+) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 02ac84b..9ae551b 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -369,6 +369,7 @@ def create_data_and_move(file_path: str, zero_op): core_main(file_path, n_number) else: print("[-] number empty ERROR") + moveFailedFolder(file_path) print("[*]======================================================") else: try: From f26987ddf96cac2556137a80cc7e21a953b26883 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 12 Oct 2021 11:42:30 +0800 Subject: [PATCH 28/56] move into try block --- WebCrawler/javbus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index e739424..46628cf 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -111,9 +111,9 @@ def getOutline(number): #获取剧情介绍 从avno1.cc取得 return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() except: pass - from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline try: - detail_html, browser = open_by_browser(number_up) + from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline + detail_html, browser = open_by_browser(number) return xcity_getOutline(detail_html) except: pass From 317449c568fb66ac7280a782459f2c1dd604d5a9 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 09:11:40 +0800 Subject: [PATCH 29/56] try fix issue 616: onedrive OSError input/output --- ADC_function.py | 3 +++ core.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index ed428bd..30c2ab9 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -622,3 +622,6 @@ def file_modification_days(filename) -> int: if days < 0: return 9999 return days + +def file_not_exist_or_empty(filepath) -> bool: + return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0 diff --git a/core.py b/core.py index 6a8af37..f38c6f1 100755 --- a/core.py +++ 
b/core.py @@ -170,13 +170,13 @@ def trailer_download(trailer, leak_word, c_word, number, path, filepath): return configProxy = config.getInstance().proxy() for i in range(configProxy.retry): - if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0: + if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'): print('[!]Video Download Failed! Trying again. [{}/3]', i + 1) download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) continue else: break - if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0: + if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'): return print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4') @@ -190,19 +190,19 @@ def extrafanart_download(data, path, filepath): for url in data: jpg_filename = f'extrafanart-{j}.jpg' jpg_fullpath = os.path.join(path, jpg_filename) - if download_only_missing_images and os.path.isfile(jpg_fullpath) and os.path.getsize(jpg_fullpath): + if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath): continue if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed': moveFailedFolder(filepath) return for i in range(configProxy.retry): - if os.path.getsize(jpg_fullpath) == 0: + if file_not_exist_or_empty(jpg_fullpath): print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) download_file_with_filename(url, jpg_filename, path, filepath) continue else: break - if os.path.getsize(jpg_fullpath) == 0: + if file_not_exist_or_empty(jpg_fullpath): return print('[+]Image Downloaded!', jpg_fullpath) j += 1 @@ -213,7 +213,7 @@ def extrafanart_download(data, path, filepath): def image_download(cover, number, leak_word, c_word, path, filepath): filename = f"{number}{leak_word}{c_word}-fanart.jpg" full_filepath = os.path.join(path, filename) - if config.getInstance().download_only_missing_images() and os.path.isfile(full_filepath) and os.path.getsize(full_filepath): + if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath): return if download_file_with_filename(cover, filename, path, filepath) == 'failed': moveFailedFolder(filepath) @@ -221,13 +221,13 @@ def image_download(cover, number, leak_word, c_word, path, filepath): configProxy = config.getInstance().proxy() for i in range(configProxy.retry): - if os.path.getsize(full_filepath) == 0: + if file_not_exist_or_empty(full_filepath): print('[!]Image Download Failed! Trying again. 
[{}/3]', i + 1) download_file_with_filename(cover, filename, path, filepath) continue else: break - if os.path.getsize(full_filepath) == 0: + if file_not_exist_or_empty(full_filepath): return print('[+]Image Downloaded!', full_filepath) shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")) From 416e8be351ce5e9d70b2f4b47cf70c63f44cb724 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 10:07:53 +0800 Subject: [PATCH 30/56] merge PR#612 --- core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index f38c6f1..ae73af8 100755 --- a/core.py +++ b/core.py @@ -85,8 +85,8 @@ def create_folder(json_data): # 创建文件夹 if 'title' in conf.location_rule() and len(title) > maxlen: shorttitle = title[0:maxlen] location_rule = location_rule.replace(title, shorttitle) - - path = os.path.join(success_folder, location_rule).strip() + # 当演员为空时,location_rule被计算为'/number'绝对路径,导致路径连接忽略第一个路径参数,因此添加./使其始终为相对路径 + path = os.path.join(success_folder, f'./{location_rule.strip()}') if not os.path.exists(path): path = escape_path(path, conf.escape_literals()) try: From 7f8d500b134ed286336b9d68a9b71c5c93c1e204 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 21:00:32 +0800 Subject: [PATCH 31/56] correct the mechanicalsoup browser cookies calling method --- ADC_function.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 30c2ab9..e5afb4b 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -85,7 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + if isinstance(cookies, dict) and len(cookies): + s = requests.Session() + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) + else: + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() @@ -104,9 +109,12 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) - if isinstance(cookies, dict): - requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies) + if isinstance(cookies, dict) and len(cookies): + s = requests.Session() + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) + else: + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() From 189f4db6161f6393cbd5d3c1c204153a3f0f7b26 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Fri, 15 Oct 2021 21:16:48 +0800 Subject: [PATCH 32/56] javdb: get faster by benefiting from HTTP keep-alive --- WebCrawler/javdb.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git
a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 4b0d4c9..185d96b 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -1,4 +1,6 @@ import sys + +from mechanicalsoup.stateful_browser import StatefulBrowser sys.path.append('../') import re from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * @@ -246,7 +248,10 @@ def main(number): print(f'[!]javdb:select site {javdb_site}') try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' - query_result = get_html(javdb_url, cookies=javdb_cookies) + res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') + if not res.ok: + raise + query_result = res.text except: query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies) html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -267,8 +272,11 @@ def main(number): raise ValueError("number not found") correct_url = urls[0] try: - javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url - detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) + if isinstance(browser, StatefulBrowser): # get faster benefit from http keep-alive + detail_page = browser.open_relative(correct_url).text + else: + javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url + detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) @@ -344,8 +352,8 @@ if __name__ == "__main__": # print(main('BANK-022')) print(main('070116-197')) print(main('093021_539')) # 没有剧照 片商pacopacomama - print(main('FC2-2278260')) - print(main('FC2-735670')) + print(main('FC2-2278260')) + print(main('FC2-735670')) # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found print(main('FC2-2314275')) From b006aee34d1382e3494b6e94d4e2156e80ead7c9 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 21:21:12 +0800 Subject: [PATCH 33/56] failed_list.txt: keep order, remove duplication --- AV_Data_Capture.py | 16 ++++++++--------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 9ae551b..d9c54b2 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -252,14 +252,14 @@ def movie_lists(source_folder, regexstr): failed_set = set() if (main_mode == 3 or soft_link) and not conf.ignore_failed_list(): try: - with open(failed_list_txt_path, 'r', encoding='utf-8') as flt: - flist = flt.read().splitlines() - failed_set = set(flist) - if len(flist) != len(failed_set): - with open(failed_list_txt_path, 'w', encoding='utf-8') as flt: - wtlines = [line + '\n' for line in failed_set] - wtlines.sort() - flt.writelines(wtlines) + flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines() + failed_set = set(flist) + if len(flist) != len(failed_set): # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的 + fset = failed_set.copy() + for i in range(len(flist)-1, -1, -1): + fset.remove(flist[i]) if flist[i] in fset else flist.pop(i) + failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8') + assert len(fset) == 0 and len(flist) == len(failed_set) except: pass if not Path(source_folder).is_dir(): From a546c4e83e595f7b1a90a9c37942a1f6162d0e2a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 21:59:08 +0800 Subject: [PATCH 34/56] Parallel query on storyline data --- ADC_function.py | 10 +- AV_Data_Capture.py | 8 +- WebCrawler/javbus.py | 46 +++---- WebCrawler/javdb.py | 25 ++--
WebCrawler/storyline.py | 270 ++++++++++++++++++++++++++++++++++++++++ config.ini | 11 +- config.py | 19 +++ 7 files changed, 336 insertions(+), 53 deletions(-) create mode 100644 WebCrawler/storyline.py diff --git a/ADC_function.py b/ADC_function.py index e5afb4b..e43fe5f 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -85,12 +85,11 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): + s = None if isinstance(cookies, dict) and len(cookies): s = requests.Session() requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) - else: - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() @@ -109,12 +108,11 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): + s = None if isinstance(cookies, dict) and len(cookies): s = requests.Session() requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) - else: - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua) + browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() if configProxy.enable: browser.session.proxies = configProxy.proxies() diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index d9c54b2..6c13e5d 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -416,7 +416,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number): print('[!]', err) -if __name__ == '__main__': +def main(): version = '5.0.1' urllib3.disable_warnings() #Ignore http proxy warning @@ -483,6 +483,7 @@ if __name__ == '__main__': count = 0 count_all = str(len(movie_list)) print('[+]Find', count_all, 'movies.') + print('[*]======================================================') stop_count = conf.stop_counter() if stop_count<1: stop_count = 999999 @@ -517,3 +518,8 @@ if __name__ == '__main__': input("Press enter key exit, you can check the error message before you exit...") sys.exit(0) + +import multiprocessing +if __name__ == '__main__': + multiprocessing.freeze_support() + main() diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 46628cf..c9d53f3 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -6,6 +6,7 @@ from lxml import etree#need install from bs4 import BeautifulSoup#need install import json from ADC_function import * +from WebCrawler.storyline import getStoryline import inspect def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img @@ -91,33 +92,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 except: pass return '' -def getOutline(number): #获取剧情介绍 从avno1.cc取得 - try: - url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + - secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), - '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' - ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 - 
number_up = number.upper() - result, browser = get_html_by_form(url, - form_select='div.wrapper > div.header > div.search > form', - fields = {'kw' : number_up}, - return_type = 'browser') - if not result.ok: - raise - title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() - page_number = title[title.rfind(' '):].upper() - if not number_up in page_number: - raise - return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() - except: - pass - try: - from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline - detail_html, browser = open_by_browser(number) - return xcity_getOutline(detail_html) - except: - pass - return '' +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) # 如果记录中冇导演,系列排在第6位 @@ -156,11 +132,12 @@ def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') + title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','') dic = { - 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), + 'title': title, 'studio': getStudio(htmlcode), 'year': getYear(htmlcode), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(htmlcode), 'director': getDirector(htmlcode), 'actor': getActor(htmlcode), @@ -189,11 +166,12 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') + title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode))) dic = { - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), + 'title': title, 'studio': getStudio(htmlcode), 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(htmlcode), 'director': getDirector(htmlcode), 'actor': getActor(htmlcode), @@ -225,7 +203,11 @@ def main(number): return js if __name__ == "__main__" : - #print(main('ADV-R0624')) # 404 + config.G_conf_override['debug_mode:switch'] = True + print(main('ABP-888')) + print(main('ABP-960')) + # print(main('ADV-R0624')) # 404 + # print(main('MMNT-010')) print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 185d96b..241de49 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -1,13 +1,11 @@ import sys - -from mechanicalsoup.stateful_browser import StatefulBrowser sys.path.append('../') import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * -# import sys +from mechanicalsoup.stateful_browser import StatefulBrowser +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) @@ -206,9 +204,8 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 except: pass return '' -def getOutline(number): #获取剧情介绍 - from WebCrawler.javbus import getOutline as javbus_getOutline - return javbus_getOutline(number) +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) def getSeries(a): #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ 
-309,7 +306,7 @@ def main(number): 'actor': getActor(detail_page), 'title': title, 'studio': getStudio(detail_page), - 'outline': getOutline(number), + 'outline': getOutline(number, title), 'runtime': getRuntime(detail_page), 'director': getDirector(detail_page), 'release': getRelease(detail_page), @@ -350,11 +347,13 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) - print(main('070116-197')) - print(main('093021_539')) # 没有剧照 片商pacopacomama - print(main('FC2-2278260')) - print(main('FC2-735670')) + # print(main('070116-197')) + # print(main('093021_539')) # 没有剧照 片商pacopacomama + # print(main('FC2-2278260')) + # print(main('FC2-735670')) # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found - print(main('FC2-2314275')) + # print(main('FC2-2314275')) + # print(main('EBOD-646')) + print(main('LOVE-262')) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py new file mode 100644 index 0000000..11142fc --- /dev/null +++ b/WebCrawler/storyline.py @@ -0,0 +1,270 @@ +import sys +sys.path.append('../') +import re +import json +from ADC_function import * +from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline +from multiprocessing import Pool +from difflib import SequenceMatcher +from unicodedata import category + +G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"} + + +# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 +def getStoryline(number, title): + start_time = time.time() + conf = config.getInstance() + debug = conf.debug() or conf.storyline_show() == 2 + storyine_sites = conf.storyline_site().split(',') + apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] + mp_args = ((site, number, title, debug) for site in apply_sites) + # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/ + with Pool() as proc_pool: + result = proc_pool.map(getStoryline_mp, mp_args) + if not debug and conf.storyline_show() == 0: + for value in result: + if isinstance(value, str) and len(value): + return value + return '' + # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 + cnt = len(apply_sites) + s = f'[!]MP Storyline 运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' + first = True + sel = '' + for i in range(cnt): + sl = len(result[i])if isinstance(result[i], str) else 0 + if sl and first: + s += f',[选中结果{apply_sites[i]}字数:{sl}]' + first = False + sel = result[i] + elif sl: + s += f',{apply_sites[i]}字数:{sl}' + else: + s += f',{apply_sites[i]}:空' + print(s) + return sel + + +def getStoryline_mp(args): + return _getStoryline_mp(*args) + + +# 注:新进程的print()不会写入日志中,将来调试修复失效数据源需直接查看标准输出,issue信息需截图屏幕 +def _getStoryline_mp(site, number, title, debug): + start_time = time.time() + storyline = None + if not isinstance(site, str): + return storyline + elif site == "airav": + storyline = getStoryline_airav(number, debug) + elif site == "avno1": + storyline = getStoryline_avno1(number, debug) + elif site == "xcity": + storyline = getStoryline_xcity(number, debug) + elif site == "amazon": + storyline = getStoryline_amazon(title, number, debug) + if not debug: + return storyline + print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( + site, + time.time() - start_time, + time.strftime("%H:%M:%S"), + storyline if isinstance(storyline, str) and len(storyline) else '[空]') + ) + return storyline + + +def getStoryline_airav(number, debug): + try: + number_up = number + site = 
secrets.choice(('airav.cc','airav4.club')) + url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' + res, browser = get_html_by_browser(url, return_type='browser') + if not res.ok: + raise ValueError(f"get_html_by_browser('{url}') failed") + avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div') + if number_up not in avs.select_one('a > h3').text.upper(): + raise ValueError("number not found") + detail_url = avs.select_one('a')['href'] + res = browser.open_relative(detail_url) + if not res.ok: + raise ValueError(f"browser.open_relative('{detail_url}') failed") + t = browser.page.select_one('head > title').text + airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper() + if number.upper() != airav_number: + raise ValueError(f"page number ->[{airav_number}] not match") + desc = browser.page.select_one('li.introduction > span').text.strip() + return desc + except Exception as e: + if debug: + print(f"[-]MP getOutline_airav Error: {e}, number [{number}].") + pass + return None + + +def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 + try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 + number_up = number.upper() + result, browser = get_html_by_form(url, + form_select='div.wrapper > div.header > div.search > form', + fields = {'kw' : number_up}, + return_type = 'browser') + if not result.ok: + raise ValueError(f"get_html_by_form('{url}','{number_up}') failed") + title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip() + page_number = title[title.rfind(' '):].upper() + if not number_up in page_number: + raise ValueError(f"page number ->[{page_number}] not match") + return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 + try: + #xcity_number = number.replace('-','') + query_result, browser = get_html_by_form( + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), + fields = {'q' : xcity_number.lower()}, + return_type = 'browser') + if not query_result or not query_result.ok: + raise ValueError("page not found") + result = browser.follow_link(browser.links('avod\/detail')[0]) + if not result.ok: + raise ValueError("detail page not found") + return browser.page.select_one('h2.title-detail + p.lead').text.strip() + except Exception as e: + if debug: + print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_amazon(q_title, number, debug): + if not isinstance(q_title, str) or not len(q_title): + return None + try: + amazon_cookie, _ = load_cookies('amazon.json') + cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None + url = "https://www.amazon.co.jp/s?k=" + q_title + res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser') + if not res.ok: + raise ValueError("get_html_by_browser() failed") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + html = etree.fromstring(str(browser.page), etree.HTMLParser()) + titles =
html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") + urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") + if not len(urls) or len(urls) != len(titles): + raise ValueError("titles not found") + idx = amazon_select_one(titles, q_title, number, debug) + if not isinstance(idx, int) or idx < 0: + raise ValueError("title and number not found") + furl = urls[idx] + r = browser.open_relative(furl) + if not r.ok: + raise ValueError("browser.open_relative()) failed.") + lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') + if isinstance(lks, list) and len(lks): + browser.follow_link(lks[0]) + cookie = None + + ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip() + ama_t = re.sub(r'審査番号:\d+', '', ama_t) + + if cookie is None: + # 自动创建的cookies文件放在搜索路径表的末端,最低优先级。有amazon.co.jp帐号的用户可以从浏览器导出cookie放在靠前搜索路径 + ama_save = Path.home() / ".local/share/avdc/amazon.json" + ama_save.parent.mkdir(parents=True, exist_ok=True) + ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') + + return ama_t + + except Exception as e: + if debug: + print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}') + pass + return None + +# 查货架中DVD和蓝光商品中标题相似度高的 +def amazon_select_one(a_titles, q_title, number, debug): + sel = -1 + ratio = 0 + que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A)) + for loc in range(len(a_titles)): + t = a_titles[loc] + if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 + ratio = 1.0 + sel = loc + save_t_ = t + break + if not re.search('DVD|Blu-ray', t, re.I): + continue + ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) + ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A)) + findlen = 0 + lastpos = -1 + cnt = len(ama_t) + for c in reversed(ama_t): + cnt -= 1 + pos = que_t.rfind(c) + if lastpos >= 0: + pos_near = que_t[:lastpos].rfind(c) + if pos_near < 0: + findlen = 0 + lastpos = -1 + ama_t = ama_t[:cnt+1] + else: + pos = pos_near + if pos < 0: + if category(c) == 'Nd': + return -1 + ama_t = ama_t[:cnt] + findlen = 0 + lastpos = -1 + continue + if findlen > 0 and len(que_t) > 1 and lastpos == pos+1: + findlen += 1 + lastpos = pos + if findlen >= 4: + break + continue + findlen = 1 + lastpos = pos + if findlen==0: + return -1 + r = SequenceMatcher(None, ama_t, que_t).ratio() + if r > ratio: + sel = loc + ratio = r + save_t_ = ama_t + if ratio > 0.999: + break + + if ratio < 0.5: + return -1 + + if not debug: + # 目前采信相似度高于0.9的结果 + return sel if ratio >= 0.9 else -1 + + # debug 模式下记录识别准确率日志 + if ratio < 0.9: + # 相似度[0.5, 0.9)的淘汰结果单独记录日志 + (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return -1 + # 被采信的结果日志 + (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + return sel diff --git a/config.ini b/config.ini index 06eda0c..5125ad3 100755 --- a/config.ini +++ b/config.ini @@ -7,7 +7,7 @@ soft_link=0 failed_move=1 auto_exit=0 transalte_to_sc=0 -multi_threading=1 +multi_threading=0 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) actor_gender=female del_empty_folder=1 @@ -85,3 +85,12 @@ water=2 switch=0 extrafanart_folder=extrafanart +; 剧情简介 +[storyline] +; website为javbus或javdb时,site为获取剧情简介信息的可选数据源站点列表。列表内站点同时并发查询,取值优先级 +; 
从左到右,靠左站点没数据才会采用后面站点获得的。其中airav和avno1是中文剧情简介,xcity和amazon是日语的,由 +; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 +; site= +site=airav,avno1,xcity,amazon +; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 +show_result=0 diff --git a/config.py b/config.py index 3b325d9..3226a55 100644 --- a/config.py +++ b/config.py @@ -240,6 +240,20 @@ class Config: def debug(self) -> bool: return self.getboolean_override("debug_mode", "switch") + def storyline_site(self) -> str: + try: + return self.conf.get("storyline", "site") + except: + return "airav,avno1,xcity,amazon" + + def storyline_show(self) -> int: + try: + v = self.conf.getint("storyline", "show_result") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 0 + + @staticmethod def _exit(sec: str) -> None: print("[-] Read config error! Please check the {} section in config.ini", sec) @@ -333,6 +347,11 @@ class Config: conf.set(sec13, "switch", 1) conf.set(sec13, "extrafanart_folder", "extrafanart") + sec14 = "storyline" + conf.add_section(sec14) + conf.set(sec14, "site", "airav,avno1,xcity,amazon") + conf.set(sec14, "show_result", 0) + return conf From bc3cda953d2f4636a149b7f73b6110f47707b965 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:29:57 +0800 Subject: [PATCH 35/56] fix --- WebCrawler/storyline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 11142fc..567c675 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -130,7 +130,7 @@ def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 try: - #xcity_number = number.replace('-','') + xcity_number = number.replace('-','') query_result, browser = get_html_by_form( 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, From 6624ed7224df104c76f6f577b6a76dd604ea4997 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:47:49 +0800 Subject: [PATCH 36/56] clean up --- WebCrawler/storyline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 567c675..5923d7d 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -3,7 +3,6 @@ sys.path.append('../') import re import json from ADC_function import * -from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline from multiprocessing import Pool from difflib import SequenceMatcher from unicodedata import category From 3420f918f50137d8e8ba7e23e7d5f490198a0c76 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 22:53:53 +0800 Subject: [PATCH 37/56] fix ratio.txt log lost newline --- WebCrawler/storyline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5923d7d..5d74d4e 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -261,9 +261,9 @@ def amazon_select_one(a_titles, q_title, number, debug): if ratio < 0.9: # 相似度[0.5, 0.9)的淘汰结果单独记录日志 (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( - f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}') + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') return -1 # 被采信的结果日志 (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( - f' [{number}] 
Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') return sel From 56bbfe6f240cbc3b3e2dd533513af4cce78f79e7 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Sun, 17 Oct 2021 23:25:19 +0800 Subject: [PATCH 38/56] storyline.py: skip SequenceMatcher when number matches --- WebCrawler/storyline.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5d74d4e..b32778d 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -203,10 +203,7 @@ def amazon_select_one(a_titles, q_title, number, debug): for loc in range(len(a_titles)): t = a_titles[loc] if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 - ratio = 1.0 - sel = loc - save_t_ = t - break + return loc if not re.search('DVD|Blu-ray', t, re.I): continue ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) From c9b96f65ab37d48e3d1b34585207aca051598152 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 08:47:11 +0800 Subject: [PATCH 39/56] one line file copy --- config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config.py b/config.py index 3226a55..5624e85 100644 --- a/config.py +++ b/config.py @@ -73,8 +73,7 @@ class Config: # 用户目录才确定具有写权限,因此选择 ~/avdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 write_path = path_search_order[2] # Path.home() / "avdc.ini" - with open(write_path, 'w', encoding='utf-8') as wcfg: - wcfg.write(res_path.read_text(encoding='utf-8')) + write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8') print("Config file '{}' created.".format(write_path.resolve())) input("Press Enter key exit...") sys.exit(0) From 24b4f9f5e25a04b6b7e27f5e5f9ab8b03ab0bae5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 10:51:32 +0800 Subject: [PATCH 40/56] =?UTF-8?q?=E5=B0=86=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=9A=84=E6=9D=A5=E6=BA=90=E7=BD=91=E7=AB=99=E8=AE=B0=E5=85=A5?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E4=BB=A5=E4=BE=BF=E8=BF=9B=E8=A1=8C=E8=AF=84?= =?UTF-8?q?=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 4 ++-- WebCrawler/__init__.py | 2 ++ WebCrawler/storyline.py | 2 +- config.py | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index e43fe5f..5b1d507 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -1,6 +1,6 @@ from os import replace import requests -import hashlib +#import hashlib from pathlib import Path import secrets import os.path import uuid import json @@ -20,7 +20,7 @@ def getXpathSingle(htmlcode, xpath): return result1 -G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36' +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36' # 网页请求核心 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index c5d02b5..b6e7b2f 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -115,6 +115,7 @@ def get_data_from_json(file_number): # 从JSON返回元数据 json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get()) # if any service return a valid return, break if get_data_state(json_data): +
print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break pool.close() pool.terminate() @@ -126,6 +127,7 @@ def get_data_from_json(file_number): # 从JSON返回元数据 json_data = json.loads(func_mapping[source](file_number)) # if any service return a valid return, break if get_data_state(json_data): + print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break except: break diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index b32778d..5ad4fd7 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -34,7 +34,7 @@ def getStoryline(number, title): for i in range(cnt): sl = len(result[i])if isinstance(result[i], str) else 0 if sl and first: - s += f',[选中结果{apply_sites[i]}字数:{sl}]' + s += f',[选中{apply_sites[i]}字数:{sl}]' first = False sel = result[i] elif sl: diff --git a/config.py b/config.py index 5624e85..abe030e 100644 --- a/config.py +++ b/config.py @@ -394,10 +394,10 @@ if __name__ == "__main__": code = compile(evstr, "<string>", "eval") print('{}: "{}"'.format(evstr, eval(code))) config = Config() - mfilter = ('conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path') + mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'} for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: evprint(f'config.{_m}()') - pfilter = ('proxies', 'SUPPORT_PROXY_TYPE') + pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'} # test getInstance() assert(getInstance() == config) for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: From f5539279136ac1148f19e1e71fbde99d9158881d Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 17:58:21 +0800 Subject: [PATCH 41/56] =?UTF-8?q?=E6=8F=90=E9=80=9F=EF=BC=8C=E6=9A=82?= =?UTF-8?q?=E6=97=B6=E5=B1=8F=E8=94=BD=E6=9C=AA=E5=AE=9E=E7=8E=B0=E7=9A=84?= =?UTF-8?q?=E6=BC=94=E5=91=98=E7=85=A7=E7=89=87=E5=8A=9F=E8=83=BD=20javdb?= =?UTF-8?q?=20javbus?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 2 +- WebCrawler/javdb.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index c9d53f3..0959e1e 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -182,7 +182,7 @@ def main(number): 'tag': getTag(htmlcode), 'extrafanart': getExtrafanart(htmlcode), 'label': getSerise(htmlcode), - 'actor_photo': getActorPhoto(htmlcode), +# 'actor_photo': getActorPhoto(htmlcode), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', 'series': getSerise(htmlcode), diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 241de49..34cfc32 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -319,7 +319,7 @@ def main(number): 'tag': getTag(detail_page), 'label': getLabel(detail_page), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(detail_page), +# 'actor_photo': getActorPhoto(detail_page), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', 'series': getSeries(detail_page), From 5ef16e3a6de5c3bb9381a6ceb9fe4b71619bcd81 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 18:09:36 +0800 Subject: [PATCH 42/56] =?UTF-8?q?=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=BF=90=E8=A1=8C=E6=A8=A1=E5=BC=8Frun=5Fmod?= 
=?UTF-8?q?e,=200:=E9=A1=BA=E5=BA=8F=E6=89=A7=E8=A1=8C=201:=E7=BA=BF?= =?UTF-8?q?=E7=A8=8B=E6=B1=A0=202:=E8=BF=9B=E7=A8=8B=E6=B1=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/storyline.py | 23 +++++++++++++++++++---- config.ini | 3 +++ config.py | 7 +++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 5ad4fd7..d9da869 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -2,13 +2,25 @@ import sys sys.path.append('../') import re import json +import builtins from ADC_function import * from multiprocessing import Pool +from multiprocessing.dummy import Pool as ThreadPool from difflib import SequenceMatcher from unicodedata import category G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"} +G_mode_txt = ('顺序执行','线程池','进程池') + +class noThread(object): + def map(self, fn, param): + return builtins.map(fn, param) + def __enter__(self): + return self + def __exit__(self, exc_type, exc_val, exc_tb): + pass + # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 def getStoryline(number, title): @@ -18,9 +30,12 @@ def getStoryline(number, title): storyine_sites = conf.storyline_site().split(',') apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) - # choose process pool not thread pool because https://www.python.org/dev/peps/pep-0371/ - with Pool() as proc_pool: - result = proc_pool.map(getStoryline_mp, mp_args) + cores = min(len(apply_sites), os.cpu_count()) + run_mode = conf.storyline_mode() + assert run_mode in (0,1,2) + with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool: + result = pool.map(getStoryline_mp, mp_args) + result = list(result) if run_mode == 0 else result if not debug and conf.storyline_show() == 0: for value in result: if isinstance(value, str) and len(value): @@ -28,7 +43,7 @@ def getStoryline(number, title): return '' # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 cnt = len(apply_sites) - s = f'[!]MP Storyline 运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' + s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' first = True sel = '' for i in range(cnt): diff --git a/config.ini b/config.ini index 5125ad3..700fa95 100755 --- a/config.ini +++ b/config.ini @@ -92,5 +92,8 @@ extrafanart_folder=extrafanart ; 于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果列表为空则不查询,设置成不查询可大幅提高刮削速度。 ; site= site=airav,avno1,xcity,amazon +; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) +run_mode=1 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 show_result=0 + diff --git a/config.py b/config.py index abe030e..83a36bc 100644 --- a/config.py +++ b/config.py @@ -252,6 +252,12 @@ class Config: except: return 0 + def storyline_mode(self) -> int: + try: + v = self.conf.getint("storyline", "run_mode") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 1 @staticmethod def _exit(sec: str) -> None: @@ -350,6 +356,7 @@ class Config: conf.add_section(sec14) conf.set(sec14, "site", "airav,avno1,xcity,amazon") conf.set(sec14, "show_result", 0) + conf.set(sec14, "run_mode", 1) return conf From 4428971135149749bc74c286ae18a0eb551570bc Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Mon, 18 Oct 2021 19:52:42 +0800 Subject: [PATCH 43/56] =?UTF-8?q?javdb.py:=20=E4=BC=98=E5=8C=96=EF=BC=8C?= 
=?UTF-8?q?=E4=BF=AE=E7=90=86getActorPhoto()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javdb.py | 103 +++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 34cfc32..e4e803c 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -9,13 +9,11 @@ from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): browser_title = str(html.xpath("/html/head/title/text()")[0]) return browser_title[:browser_title.find(' | JavDB')].strip() -def getActor(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getActor(html): actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') r = [] @@ -32,8 +30,8 @@ def getActor(a): idx = idx + 1 return r -def getaphoto(url): - html_page = get_html(url) +def getaphoto(url, browser): + html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url) img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_url = img_prether.findall(html_page) if img_url: @@ -41,24 +39,18 @@ def getaphoto(url): else: return '' -def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img - actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>') - actorall = actorall_prether.findall(html) - - if actorall: - actoralls = actorall[0] - actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>') - actor = actor_prether.findall(actoralls) - actor_photo = {} - for i in actor: - actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0]) - - return actor_photo - - else: +def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img + actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]') + if not actorall: return {} + a = getActor(html) + actor_photo = {} + for i in actorall: + if i.text in a: + actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser) + return actor_photo -def getStudio(a): +def getStudio(a, html): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") @@ -70,25 +62,21 @@ def getStudio(a): if len(result): return result # 以卖家作为工作室 - html = etree.fromstring(a, etree.HTMLParser()) try: result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") except: result = '' return result -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = 
str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') @@ -118,8 +106,7 @@ def getRelease(a): else: result = '' return result -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getTag(html): try: result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') total = [] @@ -140,11 +127,10 @@ def getTag(a): pass return total -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: @@ -175,23 +161,20 @@ def getTrailer(htmlcode): # 获取预告片 video_url = '' return video_url -def getExtrafanart(htmlcode): # 获取剧照 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getExtrafanart(html): # 获取剧照 result = [] try: result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") except: pass return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): try: result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] except: # 2020.7.17 Repair Cover Url crawl result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') @@ -206,9 +189,7 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) -def getSeries(a): - #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getSeries(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') @@ -243,6 +224,7 @@ def main(number): javdb_site = secrets.choice(javdb_sites) if debug: print(f'[!]javdb:select site {javdb_site}') + browser = None try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') @@ -277,52 +259,54 @@ def main(number): except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) + # etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用 + lx = etree.fromstring(detail_page, etree.HTMLParser()) # no cut image by default imagecut = 
3 # If gray image exists ,then replace with normal cover if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): - cover_small = getCover_small(query_result) + cover_small = getCover_small(html) else: try: - cover_small = getCover_small(query_result, index=ids.index(number)) + cover_small = getCover_small(html, index=ids.index(number)) except: # if input number is "STAR438" not "STAR-438", use first search result. - cover_small = getCover_small(query_result) + cover_small = getCover_small(html) if 'placeholder' in cover_small: # replace wit normal cover and cut it imagecut = 1 - cover_small = getCover(detail_page) + cover_small = getCover(lx) - dp_number = getNum(detail_page) + dp_number = getNum(lx) if dp_number.upper() != number: raise ValueError("number not found") - title = getTitle(detail_page) + title = getTitle(lx) if title and dp_number: number = dp_number # remove duplicate title title = title.replace(number, '').strip() dic = { - 'actor': getActor(detail_page), + 'actor': getActor(lx), 'title': title, - 'studio': getStudio(detail_page), + 'studio': getStudio(detail_page, lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), 'release': getRelease(detail_page), 'number': number, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': cover_small, 'trailer': getTrailer(detail_page), - 'extrafanart': getExtrafanart(detail_page), + 'extrafanart': getExtrafanart(lx), 'imagecut': imagecut, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), + 'tag': getTag(lx), + 'label': getLabel(lx), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), -# 'actor_photo': getActorPhoto(detail_page), +# 'actor_photo': getActorPhoto(lx, javdb_site, browser), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): @@ -356,4 +340,5 @@ if __name__ == "__main__": # print(main('EHM0001')) # not found # print(main('FC2-2314275')) # print(main('EBOD-646')) - print(main('LOVE-262')) + # print(main('LOVE-262')) + print(main('ABP-890')) From dd106453f76e19c40b31bf7559f607559c3f4bd0 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:03:51 +0800 Subject: [PATCH 44/56] =?UTF-8?q?=E5=AF=B9=E6=A0=87=E8=AE=B0=E4=B8=BA?= =?UTF-8?q?=E5=88=A0=E9=99=A4=E7=9A=84tag=E8=BF=9B=E8=A1=8C=E6=B8=85?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index b6e7b2f..44f9094 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -178,6 +178,10 @@ def get_data_from_json(file_number): # 从JSON返回元数据 imagecut = json_data.get('imagecut') tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ + while 'XXXX' in tag: + tag.remove('XXXX') + while 'xxx' in tag: + tag.remove('xxx') actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') if title == '' or number == '': @@ -306,4 +310,5 @@ def special_characters_replacement(text) -> str: replace('"', '"'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). 
# U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane - replace('|', 'ǀ')) # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('&', '&')) From d80b2eeb7d344b69a22fcec61380bcf30c7b8bd4 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:14:26 +0800 Subject: [PATCH 45/56] =?UTF-8?q?javbus.py:=20=E4=BC=98=E5=8C=96=EF=BC=8C?= =?UTF-8?q?=E4=BF=AE=E7=90=86=E6=97=A0=E7=A0=81=E7=89=87=E7=9A=84=E5=AF=BC?= =?UTF-8?q?=E6=BC=94=E3=80=81=E7=B3=BB=E5=88=97=E7=AD=89=E5=AD=97=E6=AE=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 187 ++++++++++++++++++------------------------- 1 file changed, 80 insertions(+), 107 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 0959e1e..f17a1ab 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -3,81 +3,61 @@ sys.path.append('../') import re from pyquery import PyQuery as pq#need install from lxml import etree#need install -from bs4 import BeautifulSoup#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline import inspect -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'star-name'}) +def getActorPhoto(doc): #//*[@id="star_qdt"]/li/a/img + actors = doc('div.star-name a').items() d={} - for i in a: - l=i.a['href'] - t=i.get_text() - html = etree.fromstring(get_html(l), etree.HTMLParser()) + for i in actors: + url=i.attr.href + t=i.attr.title + html = etree.fromstring(get_html(url), etree.HTMLParser()) p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) p2={t:p} d.update(p2) return d -def getTitle(htmlcode): #获取标题 - doc = pq(htmlcode) - title=str(doc('div.container h3').text()).replace(' ','-') - try: - title2 = re.sub('n\d+-','',title) - return title2 - except: - return title -def getStudio(htmlcode): #获取厂商 已修改 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - # 如果记录中冇导演,厂商排在第4位 - if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - # 如果记录中有导演,厂商排在第5位 - elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) +def getTitle(html): #获取标题 + title = str(html.xpath('/html/head/title/text()')[0]) + title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip() + return title +def getStudioJa(html): + x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getStudio(html): #获取厂商 + x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getYear(html): #获取年份 + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() + return result[:4] if len(result)>=len('2000-01-01') else '' +def getCover(doc): #获取封面链接 image = doc('a.bigImage') return 
urljoin("https://www.javbus.com", image.attr('href')) -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result -def getRuntime(htmlcode): #获取分钟 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result -def getActor(htmlcode): #获取女优 +def getActor(doc): #获取女优 b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) + actors = doc('div.star-name a').items() + for i in actors: + b.append(i.attr.title) return b -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - else: - result = '' # 记录中有可能没有导演数据 - return result -def getCID(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - #print(htmlcode) +def getNum(html): #获取番号 + kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return kwdlist[0] +def getDirectorJa(html): + x = html.xpath('//span[contains(text(),"監督:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getDirector(html): #获取导演 + x = html.xpath('//span[contains(text(),"導演:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getCID(html): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result @@ -94,27 +74,16 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) -def getSerise(htmlcode): #获取系列 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - # 如果记录中冇导演,系列排在第6位 - if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") - # 如果记录中有导演,系列排在第7位 - elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getTag(htmlcode): # 获取标签 - tag = [] - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - for i in a: - if 'onmouseout' in str(i) or '多選提交' in str(i): - continue - tag.append(translateTag_to_sc(i.get_text())) - return tag - +def getSeriseJa(html): + x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getSerise(html): #获取系列 + x = html.xpath('//span[contains(text(),"系列:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getTag(html): # 获取标签 + klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + taglist = [translateTag_to_sc(v) for v in klist[1:]] + return taglist def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>') html = 
html_pather.search(htmlcode) @@ -128,30 +97,30 @@ def getExtrafanart(htmlcode): # 获取剧照 def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number) - if getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - title = str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-','') + doc = pq(htmlcode) + lx = etree.fromstring(htmlcode, etree.HTMLParser()) + title = getTitle(lx) dic = { 'title': title, - 'studio': getStudio(htmlcode), - 'year': getYear(htmlcode), + 'studio': getStudioJa(lx), + 'year': getYear(lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'tag': getTag(htmlcode), + 'runtime': getRuntime(lx), + 'director': getDirectorJa(lx), + 'actor': getActor(doc), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(doc), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), + 'label': getSeriseJa(lx), 'imagecut': 0, - 'actor_photo': '', +# 'actor_photo': '', 'website': 'https://www.javbus.com/ja/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSeriseJa(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js @@ -166,26 +135,28 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - title = str(re.sub('\w+-\d+-', '', getTitle(htmlcode))) + doc = pq(htmlcode) + lx = etree.fromstring(htmlcode,etree.HTMLParser()) + title = getTitle(lx) dic = { 'title': title, - 'studio': getStudio(htmlcode), - 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), + 'studio': getStudio(lx), + 'year': getYear(lx), 'outline': getOutline(number, title), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'actor': getActor(doc), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(doc), 'imagecut': 1, - 'tag': getTag(htmlcode), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), -# 'actor_photo': getActorPhoto(htmlcode), + 'label': getSerise(lx), +# 'actor_photo': getActorPhoto(doc), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSerise(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') return js @@ -206,8 +177,10 @@ if __name__ == "__main__" : config.G_conf_override['debug_mode:switch'] = True print(main('ABP-888')) print(main('ABP-960')) - # print(main('ADV-R0624')) # 404 - # print(main('MMNT-010')) + print(main('ADV-R0624')) # 404 + print(main('MMNT-010')) print(main('ipx-292')) print(main('CEMD-011')) print(main('CJOD-278')) + print(main('100221_001')) + print(main('AVSW-061')) From 5da134986a674ac5d498207eacd1063cf69f4544 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:17:45 +0800 Subject: [PATCH 46/56] storyline.py: bug fix --- 
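This fixes a crash when every configured storyline site has been filtered
out: cores = min(len(apply_sites), os.cpu_count()) is then 0, and
multiprocessing rejects Pool(0)/ThreadPool(0) with ValueError. A minimal
sketch of the failure mode and of the early-return guard this patch adds
(fetch_all and the site names are illustrative, not part of the patch):

    import os
    from multiprocessing.dummy import Pool as ThreadPool

    def fetch_all(sites, worker):
        cores = min(len(sites), os.cpu_count())
        if cores == 0:                  # the guard added below
            return []
        with ThreadPool(cores) as pool:
            return pool.map(worker, sites)

    print(fetch_all(['airav', 'avno1'], str.upper))  # ['AIRAV', 'AVNO1']
    print(fetch_all([], str.upper))                  # [] instead of ValueError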
WebCrawler/storyline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index d9da869..693f404 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -31,6 +31,8 @@ def getStoryline(number, title): apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count()) + if cores == 0: + return '' run_mode = conf.storyline_mode() assert run_mode in (0,1,2) with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool: From 249884a27e6d7496bfb0944b8421f9c1c2c71e31 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 00:58:28 +0800 Subject: [PATCH 47/56] =?UTF-8?q?javbus.py:=20=E4=BC=98=E5=8C=96=E6=8F=90?= =?UTF-8?q?=E9=80=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index f17a1ab..63457bf 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -1,19 +1,18 @@ import sys sys.path.append('../') import re -from pyquery import PyQuery as pq#need install from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline import inspect -def getActorPhoto(doc): #//*[@id="star_qdt"]/li/a/img - actors = doc('div.star-name a').items() +def getActorPhoto(html): + actors = html.xpath('//div[@class="star-name"]/a') d={} for i in actors: - url=i.attr.href - t=i.attr.title + url=i.attrib['href'] + t=i.attrib['title'] html = etree.fromstring(get_html(url), etree.HTMLParser()) p=urljoin("https://www.javbus.com", str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) @@ -33,20 +32,20 @@ def getStudio(html): #获取厂商 def getYear(html): #获取年份 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() return result[:4] if len(result)>=len('2000-01-01') else '' -def getCover(doc): #获取封面链接 - image = doc('a.bigImage') - return urljoin("https://www.javbus.com", image.attr('href')) +def getCover(html): #获取封面链接 + image = str(html.xpath('//a[@class="bigImage"]/@href')[0]) + return urljoin("https://www.javbus.com", image) def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result -def getActor(doc): #获取女优 +def getActor(html): #获取女优 b=[] - actors = doc('div.star-name a').items() + actors = html.xpath('//div[@class="star-name"]/a') for i in actors: - b.append(i.attr.title) + b.append(i.attrib['title']) return b def getNum(html): #获取番号 kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') @@ -99,7 +98,6 @@ def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - doc = pq(htmlcode) lx = etree.fromstring(htmlcode, etree.HTMLParser()) title = getTitle(lx) dic = { @@ -109,10 +107,10 @@ def main_uncensored(number): 'outline': getOutline(number, title), 'runtime': getRuntime(lx), 'director': getDirectorJa(lx), - 'actor': getActor(doc), + 'actor': getActor(lx), 'release': getRelease(lx), 
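        # fields here read from the lx tree parsed once in main(); only
        # getExtrafanart() still re-scans the raw htmlcode, and getOutline()
        # queries the storyline sites by number/title instead of this page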
'number': getNum(lx), - 'cover': getCover(doc), + 'cover': getCover(lx), 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), 'label': getSeriseJa(lx), @@ -135,7 +133,6 @@ def main(number): htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: raise Exception('404 page not found') - doc = pq(htmlcode) lx = etree.fromstring(htmlcode,etree.HTMLParser()) title = getTitle(lx) dic = { @@ -145,15 +142,15 @@ def main(number): 'outline': getOutline(number, title), 'runtime': getRuntime(lx), 'director': getDirector(lx), - 'actor': getActor(doc), + 'actor': getActor(lx), 'release': getRelease(lx), 'number': getNum(lx), - 'cover': getCover(doc), + 'cover': getCover(lx), 'imagecut': 1, 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), 'label': getSerise(lx), -# 'actor_photo': getActorPhoto(doc), +# 'actor_photo': getActorPhoto(lx), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', 'series': getSerise(lx), From aae4df73fae2dd0d8788dbd2e8f491fd90d9447c Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 01:00:50 +0800 Subject: [PATCH 48/56] =?UTF-8?q?javbus.py:=20=E6=B8=85=E7=90=86=E8=BF=87?= =?UTF-8?q?=E6=9C=9F=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/javbus.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 63457bf..7866052 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -5,7 +5,6 @@ from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline -import inspect def getActorPhoto(html): actors = html.xpath('//div[@class="star-name"]/a') @@ -60,17 +59,6 @@ def getCID(html): string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 - if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): - return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 - try: - htmlcode = get_html('https://cn.airav.wiki/video/' + number) - from WebCrawler.airav import getOutline as airav_getOutline - result = airav_getOutline(htmlcode) - return result - except: - pass - return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) def getSeriseJa(html): From daf7f5e0a0efdc9aada4f0af784a7d90560bcfc0 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 15:14:15 +0800 Subject: [PATCH 49/56] =?UTF-8?q?carib.py:=20=E5=B0=9D=E8=AF=95=E8=8E=B7?= =?UTF-8?q?=E5=8F=96=E4=B8=AD=E6=96=87=E5=89=A7=E6=83=85=E4=BB=8B=E7=BB=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/__init__.py | 2 ++ WebCrawler/carib.py | 38 +++++++++++++++++++++++++------------- WebCrawler/storyline.py | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index 44f9094..039fed0 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -311,4 +311,6 @@ def special_characters_replacement(text) -> str: replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane replace('|', 'ǀ'). 
# U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane + replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK + replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK replace('&', '&')) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index c1a25d9..3e583df 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -4,26 +4,29 @@ import json from lxml import html import re from ADC_function import * +from WebCrawler.storyline import getStoryline def main(number: str) -> json: try: - carib_obj, browser = get_html_by_browser( - 'https://www.caribbeancom.com/moviepages/'+number+'/index.html', - return_type="browser") - - if not carib_obj or not carib_obj.ok: + # 因演员图片功能还未使用,为提速暂时注释,改为用get_html() + #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html', + # return_type='browser') + #if not r.ok: + # raise ValueError("page not found") + #htmlcode = str(browser.page) + htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content') + htmlcode = htmlbyte.decode('euc-jp') + if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode: raise ValueError("page not found") - lx = html.fromstring(str(browser.page)) - - if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"): - raise ValueError("page info not found") + lx = html.fromstring(htmlcode) + title = get_title(lx) dic = { - 'title': get_title(lx), + 'title': title, 'studio': '加勒比', 'year': get_year(lx), - 'outline': get_outline(lx), + 'outline': get_outline(lx, number, title), 'runtime': get_runtime(lx), 'director': '', 'actor': get_actor(lx), @@ -55,8 +58,17 @@ def get_title(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] -def get_outline(lx: html.HtmlElement) -> str: - return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() +def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: + o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() + + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g + return o def get_release(lx: html.HtmlElement) -> str: return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index 693f404..5c2b91a 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -23,11 +23,11 @@ class noThread(object): # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 -def getStoryline(number, title): +def getStoryline(number, title, sites: list=None): start_time = time.time() conf = config.getInstance() debug = conf.debug() or conf.storyline_show() == 2 - storyine_sites = conf.storyline_site().split(',') + storyine_sites = conf.storyline_site().split(',') if sites is None else sites apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site] mp_args = ((site, number, title, debug) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count()) From 8559eea29652db4dcf79cd0f741e0b2de7d009bb Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 15:18:39 +0800 Subject: [PATCH 50/56] =?UTF-8?q?avsox.py:=20=E5=85=83=E6=95=B0=E6=8D=AE?= 
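carib.py's get_outline() above narrows the storyline lookup to sites that can
return Chinese text before falling back to the Japanese description scraped
from the page itself. The order-preserving intersection it relies on, as a
standalone sketch (the configured list here is illustrative):

    configured = ['airav', 'xcity', 'amazon', 'avno1']  # priority order, as in config.ini
    a = set(configured) & {'airav', 'avno1'}            # which Chinese-capable sites are enabled
    site = [n for n in configured if n in a]            # re-apply the configured priority
    assert site == ['airav', 'avno1']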
=?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=89=A7=E6=83=85=E4=BB=8B=E7=BB=8D=E3=80=82?= =?UTF-8?q?=E4=BC=98=E5=8C=96:=E5=87=8F=E5=B0=91etree.fromstring=E9=AB=98?= =?UTF-8?q?=E5=BC=80=E9=94=80=E8=B0=83=E7=94=A8=E6=AC=A1=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/avsox.py | 53 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 293769a..a353690 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -5,12 +5,11 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -# import sys +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img - soup = BeautifulSoup(htmlcode, 'lxml') +def getActorPhoto(soup): a = soup.find_all(attrs={'class': 'avatar-box'}) d = {} for i in a: @@ -19,34 +18,28 @@ def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img p2 = {t: l} d.update(p2) return d -def getTitle(a): +def getTitle(html): try: - html = etree.fromstring(a, etree.HTMLParser()) result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0] return result.replace('/', '') except: return '' -def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() - soup = BeautifulSoup(a, 'lxml') +def getActor(soup): a = soup.find_all(attrs={'class': 'avatar-box'}) d = [] for i in a: d.append(i.span.get_text()) return d -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getStudio(html): result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') return result1 -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']") return result1 -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']") return result1 -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getNum(html): result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']") return result1 def getYear(release): @@ -55,28 +48,23 @@ def getYear(release): return result except: return release -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRelease(html): result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']") return result1 -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']") return result -def getCover_small(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result -def getTag(a): # 获取演员 - soup = BeautifulSoup(a, 'lxml') +def getTag(soup): # 获取演员 a = soup.find_all(attrs={'class': 'genre'}) d = [] for i in a: d.append(i.get_text()) return d -def getSeries(htmlcode): +def getSeries(html): try: - html = 
etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") return result1 except: @@ -98,27 +86,30 @@ def main(number): result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") web = get_html("https:" + result1) soup = BeautifulSoup(web, 'lxml') + web = etree.fromstring(web, etree.HTMLParser()) info = str(soup.find(attrs={'class': 'row movie'})) + info = etree.fromstring(info, etree.HTMLParser()) try: new_number = getNum(info) if new_number.upper() != number.upper(): raise ValueError('number not found') + title = getTitle(web).strip(getNum(web)) dic = { - 'actor': getActor(web), - 'title': getTitle(web).strip(getNum(web)), + 'actor': getActor(soup), + 'title': title, 'studio': getStudio(info), - 'outline': '', # + 'outline': getStoryline(number, title), 'runtime': getRuntime(info), 'director': '', # 'release': getRelease(info), 'number': new_number, 'cover': getCover(web), - 'cover_small': getCover_small(a), + 'cover_small': getCover_small(html), 'imagecut': 3, - 'tag': getTag(web), + 'tag': getTag(soup), 'label': getLabel(info), 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(web), + 'actor_photo': getActorPhoto(soup), 'website': "https:" + result1, 'source': 'avsox.py', 'series': getSeries(info), From c3e9ab795735df91eb23d4d28f5f66e21d2bd079 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 17:08:00 +0800 Subject: [PATCH 51/56] =?UTF-8?q?avsox.py:=20=E4=BC=98=E5=8C=96:=E5=AE=8C?= =?UTF-8?q?=E6=88=90=E7=B2=BE=E7=AE=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/avsox.py | 61 ++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index a353690..e38a452 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -3,18 +3,17 @@ sys.path.append('..') import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getActorPhoto(soup): - a = soup.find_all(attrs={'class': 'avatar-box'}) +def getActorPhoto(html): + a = html.xpath('//a[@class="avatar-box"]') d = {} for i in a: - l = i.img['src'] - t = i.span.get_text() + l = i.find('.//img').attrib['src'] + t = i.find('span').text p2 = {t: l} d.update(p2) return d @@ -24,11 +23,11 @@ def getTitle(html): return result.replace('/', '') except: return '' -def getActor(soup): - a = soup.find_all(attrs={'class': 'avatar-box'}) +def getActor(html): + a = html.xpath('//a[@class="avatar-box"]') d = [] for i in a: - d.append(i.span.get_text()) + d.append(i.find('span').text) return d def getStudio(html): result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ') @@ -57,12 +56,9 @@ def getCover(html): def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result -def getTag(soup): # 获取演员 - a = soup.find_all(attrs={'class': 'genre'}) - d = [] - for i in a: - d.append(i.get_text()) - return d +def getTag(html): + x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return x[2:] if len(x) > 2 else [] def getSeries(html): 
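    # The parse-once pattern this refactor applies across the crawlers,
    # sketched standalone (detail stands in for the fetched page source):
    #     lx = etree.fromstring(detail, etree.HTMLParser())  # expensive: do once
    #     series = getSeries(lx)                             # cheap xpath() reuse
    #     label = getLabel(lx)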
try: result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") @@ -74,45 +70,42 @@ def main(number): html = get_html('https://tellme.pw/avsox') site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0] a = get_html(site + '/cn/search/' + number) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': a = get_html(site + '/cn/search/' + number.replace('-', '_')) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': a = get_html(site + '/cn/search/' + number.replace('_', '')) - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + html = etree.fromstring(a, etree.HTMLParser()) result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") - web = get_html("https:" + result1) - soup = BeautifulSoup(web, 'lxml') - web = etree.fromstring(web, etree.HTMLParser()) - info = str(soup.find(attrs={'class': 'row movie'})) - info = etree.fromstring(info, etree.HTMLParser()) + detail = get_html("https:" + result1) + lx = etree.fromstring(detail, etree.HTMLParser()) try: - new_number = getNum(info) + new_number = getNum(lx) if new_number.upper() != number.upper(): raise ValueError('number not found') - title = getTitle(web).strip(getNum(web)) + title = getTitle(lx).strip(new_number) dic = { - 'actor': getActor(soup), + 'actor': getActor(lx), 'title': title, - 'studio': getStudio(info), + 'studio': getStudio(lx), 'outline': getStoryline(number, title), - 'runtime': getRuntime(info), + 'runtime': getRuntime(lx), 'director': '', # - 'release': getRelease(info), + 'release': getRelease(lx), 'number': new_number, - 'cover': getCover(web), + 'cover': getCover(lx), 'cover_small': getCover_small(html), 'imagecut': 3, - 'tag': getTag(soup), - 'label': getLabel(info), - 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(soup), + 'tag': getTag(lx), + 'label': getLabel(lx), + 'year': getYear(getRelease(lx)), + 'actor_photo': getActorPhoto(lx), 'website': "https:" + result1, 'source': 'avsox.py', - 'series': getSeries(info), + 'series': getSeries(lx), } except Exception as e: if config.getInstance().debug(): From b025c5185270f44672ef43c9aad149459c72aeab Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Tue, 19 Oct 2021 18:40:57 +0800 Subject: [PATCH 52/56] =?UTF-8?q?xcity.py:=E5=B0=9D=E8=AF=95=E8=8E=B7?= =?UTF-8?q?=E5=BE=97=E4=B8=AD=E6=96=87=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B?= =?UTF-8?q?=EF=BC=8C=E6=B2=A1=E6=9C=89=E5=88=99=E7=94=A8=E5=8E=9F=E6=9D=A5?= =?UTF-8?q?=E7=9A=84=E3=80=82=E4=BF=AE=E5=A4=8Dtag=E6=95=B0=E7=9B=AE?= =?UTF-8?q?=E4=B8=8D=E5=AF=B9=EF=BC=8C=E4=BF=AE=E5=A4=8Druntime=E4=B8=8D?= =?UTF-8?q?=E6=98=BE=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/xcity.py | 99 ++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index 4bbdec1..6eb208d 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -3,16 +3,12 @@ sys.path.append('../') import re from lxml 
import etree import json -from bs4 import BeautifulSoup from ADC_function import * - - -# import sys +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): result = html.xpath('//*[@id="program_detail_title"]/text()')[0] return result @@ -43,8 +39,7 @@ def getActorPhoto(browser): return o -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getStudio(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") except: @@ -52,20 +47,14 @@ def getStudio(a): return result.strip('+').replace("', '", '').replace('"', '') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): try: - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] - except: - return '' - try: - return re.findall('\d+',result1)[0] + x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip() + return x except: return '' - -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] return result @@ -73,8 +62,7 @@ def getLabel(a): return '' -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): try: result = html.xpath('//*[@id="hinban"]/text()')[0] return result @@ -90,8 +78,7 @@ def getYear(getRelease): return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRelease(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) except: @@ -102,31 +89,22 @@ def getRelease(a): return '' -def getTag(a): - result2=[] - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()') - for i in result1: - i=i.replace(u'\n','') - i=i.replace(u'\t','') - if len(i): - result2.append(i) - return result2 +def getTag(html): + x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()') + return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else [] -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] if not 'https' in result: result = 'https:' + result return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] return 'https:' + result @@ -134,8 +112,7 @@ def getCover(htmlcode): return '' -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): try: result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') return result @@ -143,19 +120,21 @@ def getDirector(a): return '' -def 
getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getOutline(html, number, title): + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0] + x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0] + return x.replace(getNum(html), '') except: return '' - try: - return re.sub('\\\\\w*\d+','',result) - except: - return result -def getSeries(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getSeries(html): try: try: result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] @@ -198,33 +177,35 @@ def main(number): try: detail_page, browser = open_by_browser(number) url = browser.url - newnum = getNum(detail_page).upper() + lx = etree.fromstring(detail_page, etree.HTMLParser()) + newnum = getNum(lx).upper() number_up = number.upper() if newnum != number_up: if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found") + title = getTitle(lx) dic = { 'actor': getActor(browser), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), + 'title': title, + 'studio': getStudio(lx), + 'outline': getOutline(lx, number, title), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'release': getRelease(lx), 'number': newnum, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': '', 'extrafanart': getExtrafanart(detail_page), 'imagecut': 1, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), + 'tag': getTag(lx), + 'label': getLabel(lx), + 'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()), # 'actor_photo': getActorPhoto(browser), 'website': url, 'source': 'xcity.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } except Exception as e: if config.getInstance().debug(): From cb83e4246db7ea503ba9f3ef9bccec2274e14ac7 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Wed, 20 Oct 2021 03:34:44 +0800 Subject: [PATCH 53/56] =?UTF-8?q?=E6=97=A0=E7=A0=81=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=E7=A7=BB=E5=85=A5number=5Fparser.py=E5=B9=B6=E6=89=A9=E5=85=85?= =?UTF-8?q?=E8=AF=86=E5=88=AB=E8=83=BD=E5=8A=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 11 ----------- config.ini | 3 +-- core.py | 2 +- number_parser.py | 34 +++++++++++++++++++++++++++++++++- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 5b1d507..12fecce 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -566,17 +566,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t return trans_result -# ========================================================================是否为无码 -def is_uncensored(number): - if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper(): - return True - configs = config.getInstance().get_uncensored() - prefix_list = str(configs).split(',') - for pre in prefix_list: - if 
pre.upper() in number.upper(): - return True - return False - # 从浏览器中导出网站登录验证信息的cookies,能够以会员方式打开游客无法访问到的页面 # 示例: FC2-755670 url https://javdb9.com/v/vO8Mn # json 文件格式 diff --git a/config.ini b/config.ini index 700fa95..b4d9fb4 100755 --- a/config.ini +++ b/config.ini @@ -65,8 +65,7 @@ switch=0 ; 用来确定是否是无码 [uncensored] -uncensored_prefix=S2M,BT,LAF,SMD - +uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED [media] ; 影片后缀 diff --git a/core.py b/core.py index ae73af8..d7066f4 100755 --- a/core.py +++ b/core.py @@ -12,7 +12,7 @@ from datetime import datetime from ADC_function import * from WebCrawler import get_data_from_json - +from number_parser import is_uncensored def escape_path(path, escape_literals: str): # Remove escape literals backslash = '\\' diff --git a/number_parser.py b/number_parser.py index 616af85..212c2c0 100755 --- a/number_parser.py +++ b/number_parser.py @@ -1,6 +1,7 @@ import os import re import sys +import config G_spat = re.compile( "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@", @@ -82,6 +83,37 @@ def get_number_by_dict(filename: str) -> str: pass return None +class Cache_uncensored_conf: + prefix = None + def is_empty(self): + return bool(self.prefix is None) + def set(self, v: list): + if not v or not len(v) or not len(v[0]): + raise ValueError('input prefix list empty or None') + s = v[0] + if len(v) > 1: + for i in v[1:]: + s += f"|{i}.+" + self.prefix = re.compile(s, re.I) + def check(self, number): + if self.prefix is None: + raise ValueError('No init re compile') + return self.prefix.match(number) + +G_cache_uncensored_conf = Cache_uncensored_conf() + +# ========================================================================是否为无码 +def is_uncensored(number): + if re.match( +r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}', + number, + re.I + ): + return True + if G_cache_uncensored_conf.is_empty(): + G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(',')) + return G_cache_uncensored_conf.check(number) + if __name__ == "__main__": # import doctest # doctest.testmod(raise_on_error=True) @@ -164,7 +196,7 @@ if __name__ == "__main__": try: n = get_number(True, filename) if n: - print(f' [{n}] # {filename}') + print(' [{0}] {2}# {1}'.format(n, filename, '#无码' if is_uncensored(n) else '')) else: print(f'[-]Number return None. 
# {filename}') except Exception as e: From c44031548809cb1a53cc5e10c81adcd5c500a3e3 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Wed, 20 Oct 2021 23:07:37 +0800 Subject: [PATCH 54/56] =?UTF-8?q?=E7=BF=BB=E8=AF=91=E5=89=8D=E6=A3=80?= =?UTF-8?q?=E6=9F=A5=E8=AF=AD=E8=A8=80=EF=BC=8C=E5=B7=B2=E7=BB=8F=E6=98=AF?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E4=BA=86=E4=B8=8D=E5=BF=85=E7=BF=BB=E8=AF=91?= =?UTF-8?q?=EF=BC=8C=E5=8F=AA=E7=BF=BB=E8=AF=91=E6=97=A5=E8=AF=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ADC_function.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ADC_function.py b/ADC_function.py index 12fecce..36be657 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -511,6 +511,9 @@ def translate( delay: int = 0, ): trans_result = "" + # 中文句子如果包含&等符号会被谷歌翻译截断损失内容,而且中文翻译到中文也没有意义,故而忽略,只翻译带有日语假名的 + if not is_japanese(src): + return src if engine == "google-free": gsite = config.getInstance().get_translate_service_site() if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite): @@ -620,3 +623,7 @@ def file_modification_days(filename) -> int: def file_not_exist_or_empty(filepath) -> bool: return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0 + +# 日语简单检测 +def is_japanese(s) -> bool: + return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE)) From 1f9bf6b4c283c804252d2ebd8e0c9fc123d5dde5 Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Thu, 21 Oct 2021 19:57:09 +0800 Subject: [PATCH 55/56] =?UTF-8?q?=E6=97=A5=E5=BF=97=E5=90=88=E5=B9=B6:?= =?UTF-8?q?=E4=B8=89=E5=A4=A9=E4=B9=8B=E5=89=8D=E7=9A=84=E6=97=A5=E5=BF=97?= =?UTF-8?q?=EF=BC=8C=E5=90=88=E5=B9=B6=E4=B8=BA=E5=8D=95=E6=97=A5=E5=8D=95?= =?UTF-8?q?=E4=B8=AA=E6=96=87=E4=BB=B6=EF=BC=8C=E4=BB=A5=E8=A7=A3=E5=86=B3?= =?UTF-8?q?=E5=A2=9E=E9=87=8F=E5=A4=84=E7=90=86=E6=97=B6=E5=B0=8F=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=BF=87=E5=A4=9A=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AV_Data_Capture.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 6c13e5d..e87be03 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -162,14 +162,15 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 合并日志 只检测日志目录内的文本日志,忽略子目录。三个月前的日志,按月合并为一个月志, - # 去年及以前的月志,今年4月以后将之按年合并为年志 + # 合并日志 只检测日志目录内的文本日志,忽略子目录。三天前的日志,按日合并为单个日志,三个月前的日志, + # 按月合并为单个月志,去年及以前的月志,今年4月以后将之按年合并为年志 # 测试步骤: """ LOGDIR=/tmp/avlog mkdir -p $LOGDIR for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done + for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" # 1932 files in /tmp/avlog avdc -zgic1 -d0 -m3 -o $LOGDIR @@ -177,19 +178,40 @@ def close_logfile(logdir: str): ls $LOGDIR # rm -rf $LOGDIR """ - # 第一步,合并到月 today = datetime.today() + # 第一步,合并到日。3天前的日志,文件名是同一天的合并为一份日志 + for i in range(1): + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)] + if not txts or not len(txts): + break + e = [f for f in txts if '_err' in f.stem] + txts.sort() + tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99") + deadline_day = f'avdc_{tmstr_3_days_ago}' + day_merge = [f for f in txts if f.stem < deadline_day] + if not 
day_merge or not len(day_merge): + break + cutday = len('T235959.txt') # cut length avdc_20201201|T235959.txt + for f in day_merge: + try: + day_file_name = str(f)[:-cutday] + '.txt' # avdc_20201201.txt + with open(day_file_name, 'a', encoding='utf-8') as m: + m.write(f.read_text(encoding='utf-8')) + f.unlink(missing_ok=True) + except: + pass + # 第二步,合并到月 for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 - txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{8}T\d{6}', f.stem, re.A)] + txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)] if not txts or not len(txts): break txts.sort() - tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32T") + tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32") deadline_month = f'avdc_{tmstr_3_month_ago}' month_merge = [f for f in txts if f.stem < deadline_month] if not month_merge or not len(month_merge): break - tomonth = len('01T235959.txt') # cut length avdc_202012|01T235959.txt + tomonth = len('01.txt') # cut length avdc_202012|01.txt for f in month_merge: try: month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt @@ -198,10 +220,10 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 第二步,月合并到年 + # 第三步,月合并到年 if today.month < 4: return - mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'avdc_\d{6}', f.stem, re.A)] + mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)] if not mons or not len(mons): return mons.sort() @@ -218,7 +240,7 @@ def close_logfile(logdir: str): f.unlink(missing_ok=True) except: pass - # 第三步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 + # 第四步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 # 这种粒度的文本日志,压缩比是目前最好的。lzip -9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, # 多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, # 100MB的日志文件能缩小到3.7MB。 From 850679705ee6eacaedeb37b2c2405fcf315b947a Mon Sep 17 00:00:00 2001 From: lededev <lededev@noreplay.github.com> Date: Thu, 21 Oct 2021 20:02:07 +0800 Subject: [PATCH 56/56] =?UTF-8?q?=E5=89=A7=E6=83=85=E7=AE=80=E4=BB=8B:?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=97=A0=E7=A0=81=E5=85=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E7=AB=99=E7=82=B9=EF=BC=8C=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E9=80=9A=E7=94=A8=E3=80=81=E6=9C=89=E7=A0=81?= =?UTF-8?q?=E3=80=81=E6=97=A0=E7=A0=81=E4=B8=89=E7=A7=8D=E7=AB=99=E7=82=B9?= =?UTF-8?q?=E5=88=86=E5=88=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebCrawler/carib.py | 11 ++----- WebCrawler/storyline.py | 71 +++++++++++++++++++++++++++++++++++------ WebCrawler/xcity.py | 2 +- config.ini | 13 +++++--- config.py | 18 +++++++++-- core.py | 5 +-- number_parser.py | 2 +- 7 files changed, 92 insertions(+), 30 deletions(-) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index 3e583df..790b910 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -60,14 +60,9 @@ def get_year(lx: html.HtmlElement) -> str: def get_outline(lx: html.HtmlElement, number: str, title: str) -> str: o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() - - storyline_site = config.getInstance().storyline_site().split(',') - a = set(storyline_site) & {'airav', 'avno1'} - if len(a): - site = [n for n in storyline_site if n in a] - g = getStoryline(number, title, site) - if len(g): - return g + g = getStoryline(number, title) + if len(g): + return g return o def get_release(lx: 
From 850679705ee6eacaedeb37b2c2405fcf315b947a Mon Sep 17 00:00:00 2001
From: lededev <lededev@noreplay.github.com>
Date: Thu, 21 Oct 2021 20:02:07 +0800
Subject: [PATCH 56/56] Storyline: add an uncensored metadata site; the config
 file now lists general, censored and uncensored sites separately
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebCrawler/carib.py     | 11 ++-----
 WebCrawler/storyline.py | 71 +++++++++++++++++++++++++++++++++++------
 WebCrawler/xcity.py     |  2 +-
 config.ini              | 13 +++++---
 config.py               | 18 +++++++++--
 core.py                 |  5 +--
 number_parser.py        |  2 +-
 7 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index 3e583df..790b910 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -60,14 +60,9 @@ def get_year(lx: html.HtmlElement) -> str:
 
 def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
     o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
-
-    storyline_site = config.getInstance().storyline_site().split(',')
-    a = set(storyline_site) & {'airav', 'avno1'}
-    if len(a):
-        site = [n for n in storyline_site if n in a]
-        g = getStoryline(number, title, site)
-        if len(g):
-            return g
+    g = getStoryline(number, title)
+    if len(g):
+        return g
     return o
 
 def get_release(lx: html.HtmlElement) -> str:
diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py
index 5c2b91a..9b0a44c 100644
--- a/WebCrawler/storyline.py
+++ b/WebCrawler/storyline.py
@@ -8,8 +8,9 @@ from multiprocessing import Pool
 from multiprocessing.dummy import Pool as ThreadPool
 from difflib import SequenceMatcher
 from unicodedata import category
+from number_parser import is_uncensored
 
-G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon"}
+G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
 
 G_mode_txt = ('顺序执行','线程池','进程池')
 
@@ -28,7 +29,16 @@ def getStoryline(number, title, sites: list=None):
     conf = config.getInstance()
     debug = conf.debug() or conf.storyline_show() == 2
     storyine_sites = conf.storyline_site().split(',') if sites is None else sites
-    apply_sites = [ s for s in storyine_sites if s in G_registered_storyline_site]
+    if is_uncensored(number):
+        storyine_sites += conf.storyline_uncensored_site().split(',')
+    else:
+        storyine_sites += conf.storyline_censored_site().split(',')
+    r_dup = set()
+    apply_sites = []
+    for s in storyine_sites:
+        if s in G_registered_storyline_site and s not in r_dup:
+            apply_sites.append(s)
+            r_dup.add(s)
     mp_args = ((site, number, title, debug) for site in apply_sites)
     cores = min(len(apply_sites), os.cpu_count())
     if cores == 0:
@@ -80,6 +90,8 @@ def _getStoryline_mp(site, number, title, debug):
         storyline = getStoryline_xcity(number, debug)
     elif site == "amazon":
         storyline = getStoryline_amazon(title, number, debug)
+    elif site == "58avgo":
+        storyline = getStoryline_58avgo(number, debug)
     if not debug:
         return storyline
     print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
@@ -119,24 +131,63 @@ def getStoryline_airav(number, debug):
     return None
 
 
+def getStoryline_58avgo(number, debug):
+    try:
+        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
+            '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
+            '?status=1&Sort=Playon', '?status=1&Sort=dateupload', '?status=1&Sort=dateproduce'
+        ])  # pick one at random so a single IP's requests do not look too uniform in the site's httpd logs
+        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+        result, browser = get_html_by_form(url,
+                          fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+                          return_type = 'browser')
+        if not result.ok:
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        if f'searchresults.aspx?Search={kwd}' not in browser.url:
+            raise ValueError("number not found")
+        s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
+        link = None
+        for i in range(len(s)):
+            title = s[i].h3.text.strip()
+            if re.search(number, title, re.I):
+                link = s[i]
+                break
+        if link is None:
+            raise ValueError("number not found")
+        result = browser.follow_link(link)
+        if not result.ok or 'playon.aspx' not in browser.url:
+            raise ValueError("detail page not found")
+        title = browser.page.select('head > title')[0].text.strip()
+        detail_number = str(re.findall(r'\[(.*?)]', title)[0])
+        if not re.search(number, detail_number, re.I):
+            raise ValueError(f"detail page number does not match, got ->[{detail_number}]")
+        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
 def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), '?top=1&cat=hd',
             '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
         ])  # pick one at random so a single IP's requests do not look too uniform in the site's httpd logs
-        number_up = number.upper()
         result, browser = get_html_by_form(url,
                           form_select='div.wrapper > div.header > div.search > form',
-                          fields = {'kw' : number_up},
+                          fields = {'kw' : number},
                           return_type = 'browser')
         if not result.ok:
-            raise ValueError(f"get_html_by_form('{url}','{number_up}') failed")
-        title = browser.page.select('div.type_movie > div > ul > li > div > a > h3')[0].text.strip()
-        page_number = title[title.rfind(' '):].upper()
-        if not number_up in page_number:
-            raise ValueError(f"page number ->[{page_number}] not match")
-        return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip()
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        s = browser.page.select('div.type_movie > div > ul > li > div')
+        for i in range(len(s)):
+            title = s[i].a.h3.text.strip()
+            page_number = title[title.rfind(' '):].strip()
+            if re.search(number, page_number, re.I):
+                return s[i]['data-description'].strip()
+        raise ValueError(f"no search result matched number [{number}]")
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
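Taken together, getStoryline() now builds its site list in two stages: the general list from site= is extended by either the uncensored or the censored list, then de-duplicated while preserving left-to-right priority. A sketch of that selection logic, with the new default lists assumed from the config.ini hunk below:

    G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}

    def apply_sites_for(uncensored: bool) -> list:
        sites = ['avno1']                                           # [storyline] site=
        sites += ['58avgo'] if uncensored else ['airav', 'xcity', 'amazon']
        seen, result = set(), []
        for s in sites:                                             # order-preserving de-dup
            if s in G_registered_storyline_site and s not in seen:
                result.append(s)
                seen.add(s)
        return result

    print(apply_sites_for(False))  # ['avno1', 'airav', 'xcity', 'amazon']
    print(apply_sites_for(True))   # ['avno1', '58avgo']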
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index 6eb208d..ed381e7 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -122,7 +122,7 @@ def getDirector(html):
 
 def getOutline(html, number, title):
     storyline_site = config.getInstance().storyline_site().split(',')
-    a = set(storyline_site) & {'airav', 'avno1'}
+    a = set(storyline_site) & {'airav', 'avno1'}  # only the Chinese outline text is wanted here
     if len(a):
         site = [n for n in storyline_site if n in a]
         g = getStoryline(number, title, site)
diff --git a/config.ini b/config.ini
index b4d9fb4..eef14db 100755
--- a/config.ini
+++ b/config.ini
@@ -86,11 +86,16 @@ extrafanart_folder=extrafanart
 
 ; Storyline
 [storyline]
-; When website is javbus or javdb, site is the list of optional data sources for storyline text. Sites in the list are queried
-; concurrently; priority runs from left to right, and a site further right is only used when the sites to its left return nothing.
-; airav and avno1 give Chinese storylines, xcity and amazon Japanese ones; since the amazon store has no ID numbers, the matching DVD is picked with only 99.6% accuracy. An empty list disables the lookup, which speeds up scraping considerably.
+; When website is javbus, javdb, avsox, xcity or carib, the keys site, censored_site and uncensored_site are the lists of
+; optional data sources for storyline text. Sites in a list are queried concurrently; priority runs from left to right, and a
+; site further right is only used when the sites to its left return nothing. airav, avno1 and 58avgo return Chinese storylines:
+; airav only covers censored titles, avno1 covers both, and 58avgo only covers uncensored or leaked/decensored titles (that capability is unused).
+; xcity and amazon are Japanese; since the amazon store has no ID numbers, the matching DVD is picked with only 99.6% accuracy.
+; If all three lists are empty no lookup is done, which speeds up scraping considerably.
 ; site=
-site=airav,avno1,xcity,amazon
+site=avno1
+censored_site=airav,xcity,amazon
+uncensored_site=58avgo
 ; Run mode: 0 = sequential (slowest), 1 = thread pool (default), 2 = process pool (higher start-up cost than the thread pool; faster the more sites are queried concurrently)
 run_mode=1
 ; show_result: storyline debug output, 0 = off, 1 = brief, 2 = verbose (the verbose part is not logged); turn on 2 to see why a storyline lookup fails
diff --git a/config.py b/config.py
index 83a36bc..f6d6488 100644
--- a/config.py
+++ b/config.py
@@ -243,7 +243,19 @@ class Config:
         try:
             return self.conf.get("storyline", "site")
         except:
-            return "airav,avno1,xcity,amazon"
+            return "avno1"
+
+    def storyline_censored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "censored_site")
+        except:
+            return "airav,xcity,amazon"
+
+    def storyline_uncensored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "uncensored_site")
+        except:
+            return "58avgo"
 
     def storyline_show(self) -> int:
         try:
@@ -354,7 +366,9 @@ class Config:
 
         sec14 = "storyline"
         conf.add_section(sec14)
-        conf.set(sec14, "site", "airav,avno1,xcity,amazon")
+        conf.set(sec14, "site", "avno1")
+        conf.set(sec14, "censored_site", "airav,xcity,amazon")
+        conf.set(sec14, "uncensored_site", "58avgo")
         conf.set(sec14, "show_result", 0)
         conf.set(sec14, "run_mode", 1)
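Each new getter follows the same pattern as the existing ones: read the key, and fall back to the shipped default when the key (or the whole section) is missing, so older config.ini files keep working unchanged. A minimal sketch of that fallback with Python's standard configparser (value and default taken from the hunk above):

    import configparser

    conf = configparser.ConfigParser()
    conf.read_string("[storyline]\nsite=avno1\n")   # censored_site / uncensored_site omitted

    def storyline_censored_site() -> str:
        try:
            return conf.get("storyline", "censored_site")   # raises NoOptionError here
        except:
            return "airav,xcity,amazon"                     # default mirrors config.py

    print(storyline_censored_site())                        # 'airav,xcity,amazon'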
"run_mode", 1) diff --git a/core.py b/core.py index d7066f4..24c1ce5 100755 --- a/core.py +++ b/core.py @@ -566,10 +566,7 @@ def core_main(file_path, number_th): c_word = '-C' # 中文字幕影片后缀 # 判断是否无码 - if is_uncensored(number): - uncensored = 1 - else: - uncensored = 0 + uncensored = 1 if is_uncensored(number) else 0 if '流出' in filepath or 'uncensored' in filepath: diff --git a/number_parser.py b/number_parser.py index 212c2c0..4d4fe93 100755 --- a/number_parser.py +++ b/number_parser.py @@ -71,7 +71,7 @@ G_TAKE_NUM_RULES = { '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'), 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()), 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]), - 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[-|_]{1}(\d{3,4})[^\d]*', x, re.I)[0]) + 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]) } def get_number_by_dict(filename: str) -> str: