PEP8 PREFIX, AND SOME TYPING ANNOTATION
This commit is contained in:
121
number_parser.py
121
number_parser.py
@@ -2,36 +2,37 @@ import os
|
||||
import re
|
||||
import sys
|
||||
import config
|
||||
import typing
|
||||
|
||||
# Case-insensitive pattern for site watermarks ("22-sht.me", "hhd800.com@")
# and quality/variant tags (fhd, hd, sd, 1080p, 720p, uncensored, leak).
# NOTE(review): presumably these are stripped from file names before the
# number is extracted; the code that uses G_spat is outside this view.
# A raw string is used so that "\." is a regex escape rather than an invalid
# Python string escape (the compiled pattern is unchanged).
G_spat = re.compile(
    r"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
    re.IGNORECASE)
|
||||
|
||||
|
||||
def get_number(debug,file_path: str) -> str:
|
||||
# """
|
||||
# >>> from number_parser import get_number
|
||||
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829-C.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("C:¥Users¥Guest¥snis-829.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("C:¥Users¥Guest¥snis-829-C.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("./snis-829.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("./snis-829-C.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number(".¥snis-829.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number(".¥snis-829-C.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("snis-829.mp4")
|
||||
# 'snis-829'
|
||||
# >>> get_number("snis-829-C.mp4")
|
||||
# 'snis-829'
|
||||
# """
|
||||
def get_number(debug: bool, file_path: str) -> str:
|
||||
"""
|
||||
从文件路径中提取番号 from number_parser import get_number
|
||||
>>> get_number(False, "/Users/Guest/AV_Data_Capture/snis-829.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "/Users/Guest/AV_Data_Capture/snis-829-C.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "C:¥Users¥Guest¥snis-829.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "C:¥Users¥Guest¥snis-829-C.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "./snis-829.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "./snis-829-C.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, ".¥snis-829.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, ".¥snis-829-C.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "snis-829.mp4")
|
||||
'snis-829'
|
||||
>>> get_number(False, "snis-829-C.mp4")
|
||||
'snis-829'
|
||||
"""
|
||||
filepath = os.path.basename(file_path)
|
||||
# debug True 和 False 两块代码块合并,原因是此模块及函数只涉及字符串计算,没有IO操作,debug on时输出导致异常信息即可
|
||||
try:
|
||||
@@ -56,7 +57,7 @@ def get_number(debug,file_path: str) -> str:
|
||||
try:
|
||||
return str(
|
||||
re.findall(r'(.+?)\.',
|
||||
str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
|
||||
str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
|
||||
"['']").replace('_', '-')
|
||||
except:
|
||||
return str(re.search(r'(.+?)\.', filepath)[0])
|
||||
@@ -68,29 +69,33 @@ def get_number(debug,file_path: str) -> str:
|
||||
|
||||
# Extract the number following the naming conventions of the javdb data source.
# Each key is a regex tested case-insensitively against the file name (in
# insertion order); the associated callable turns a matching file name into
# its normalized number. The callables raise AttributeError (re.search found
# nothing) or IndexError (re.findall returned an empty list) when the file
# name matches the key but lacks the expected digits.
G_TAKE_NUM_RULES = {
    'tokyo.*hot': lambda x: str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
    'carib': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
    '1pon|mura|paco': lambda x: str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
    '10mu': lambda x: str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
    'x-art': lambda x: str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
    'xxx-av': lambda x: ''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
    'heydouga': lambda x: 'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0]),
    'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0]
}


def get_number_by_dict(filename: str) -> typing.Optional[str]:
    """Return the number extracted by the first matching G_TAKE_NUM_RULES rule.

    Args:
        filename: File name to inspect.

    Returns:
        The normalized number produced by the first rule whose key matches
        ``filename``, or ``None`` when no rule matches, when the matching
        rule cannot extract a number, or when ``filename`` is not a string.
    """
    try:
        for k, v in G_TAKE_NUM_RULES.items():
            if re.search(k, filename, re.I):
                return v(filename)
    except (AttributeError, IndexError, TypeError):
        # Best-effort lookup: a rule that matched but found no digits
        # (AttributeError / IndexError) or a non-string input (TypeError)
        # simply means "no number found". Other exceptions are no longer
        # swallowed, unlike the previous bare ``except``.
        return None
    return None
|
||||
|
||||
|
||||
class Cache_uncensored_conf:
|
||||
prefix = None
|
||||
|
||||
def is_empty(self):
|
||||
return bool(self.prefix is None)
|
||||
|
||||
def set(self, v: list):
|
||||
if not v or not len(v) or not len(v[0]):
|
||||
raise ValueError('input prefix list empty or None')
|
||||
@@ -99,28 +104,32 @@ class Cache_uncensored_conf:
|
||||
for i in v[1:]:
|
||||
s += f"|{i}.+"
|
||||
self.prefix = re.compile(s, re.I)
|
||||
|
||||
def check(self, number):
|
||||
if self.prefix is None:
|
||||
raise ValueError('No init re compile')
|
||||
return self.prefix.match(number)
|
||||
|
||||
|
||||
G_cache_uncensored_conf = Cache_uncensored_conf()
|
||||
|
||||
|
||||
# ======================================================================== uncensored check
def is_uncensored(number) -> bool:
    """Return whether *number* is recognised as an uncensored title.

    The number is first tested against a built-in pattern (digit/dash runs,
    ``dddddd_dd(d)``, Tokyo-Hot style prefixes, ``heyzo``, ``xxx-av-``,
    ``heydouga-`` and ``x-art.dd.dd.dd``). If that does not match, it is
    checked against the prefixes returned by
    ``config.getInstance().get_uncensored()`` (a comma-separated string),
    which are compiled once and cached in ``G_cache_uncensored_conf``.

    Args:
        number: The extracted number string to classify.

    Returns:
        ``True`` when either check matches at the start of ``number``,
        otherwise ``False``.

    Raises:
        ValueError: Propagated from ``G_cache_uncensored_conf.set`` when the
            configured prefix list is empty.
    """
    if re.match(
        r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
        number,
        re.I
    ):
        return True
    # Compile the configured prefixes lazily, on first use only.
    if G_cache_uncensored_conf.is_empty():
        G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
    # check() yields a match object or None; normalise to a real bool so both
    # branches of this predicate return the same type.
    return bool(G_cache_uncensored_conf.check(number))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# import doctest
|
||||
# doctest.testmod(raise_on_error=True)
|
||||
# import doctest
|
||||
# doctest.testmod(raise_on_error=True)
|
||||
test_use_cases = (
|
||||
"MEYD-594-C.mp4",
|
||||
"SSIS-001_C.mp4",
|
||||
@@ -131,26 +140,30 @@ if __name__ == "__main__":
|
||||
"SDDE-625_uncensored_C.mp4",
|
||||
"SDDE-625_uncensored_leak_C.mp4",
|
||||
"SDDE-625_uncensored_leak_C_cd1.mp4",
|
||||
"Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取
|
||||
"Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取
|
||||
"TokyoHot-n1287-HD SP2006 .mp4",
|
||||
"caribean-020317_001.nfo", # -号误命名为_号的
|
||||
"caribean-020317_001.nfo", # -号误命名为_号的
|
||||
"257138_3xplanet_1Pondo_080521_001.mp4",
|
||||
"ADV-R0624-CD3.wmv", # 多碟影片
|
||||
"XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源
|
||||
"ADV-R0624-CD3.wmv", # 多碟影片
|
||||
"XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源
|
||||
"xxx-av 20589.mp4",
|
||||
"Muramura-102114_145-HD.wmv", # 新支持片商格式 102114_145 命名规则来自javdb数据源
|
||||
"heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
|
||||
"HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源
|
||||
"pacopacomama-093021_539-FHD.mkv", # 新支持片商格式 093021_539 命名规则来自javdb数据源
|
||||
"heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
|
||||
"HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源
|
||||
"pacopacomama-093021_539-FHD.mkv", # 新支持片商格式 093021_539 命名规则来自javdb数据源
|
||||
"sbw99.cc@heyzo_hd_2636_full.mp4"
|
||||
)
|
||||
|
||||
|
||||
def evprint(evstr):
    """Evaluate the expression *evstr* and print its result beside its argument.

    The output line is the evaluated result right-aligned in 20 columns,
    followed by ``# '<arg>'`` where ``<arg>`` is ``evstr[18:-2]``. With the
    caller's ``get_number(True, "<name>")`` strings, that slice drops the
    18-character ``get_number(True, "`` prefix and the trailing ``")``,
    leaving the bare file name.

    Args:
        evstr: Source text of a single Python expression.
    """
    # NOTE: eval() executes arbitrary code. The visible caller builds evstr
    # from the test_use_cases literals; do not pass strings derived from
    # untrusted input (a file name containing a double quote would already
    # break out of the string literal).
    code = compile(evstr, "<string>", "eval")
    print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
|
||||
|
||||
|
||||
for t in test_use_cases:
|
||||
evprint(f'get_number(True, "{t}")')
|
||||
|
||||
if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
|
||||
if len(sys.argv) <= 1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
|
||||
sys.exit(0)
|
||||
|
||||
# 使用Everything的ES命令行工具搜集全盘视频文件名作为用例测试number数据,参数为盘符 A .. Z 或带盘符路径
|
||||
@@ -169,6 +182,7 @@ if __name__ == "__main__":
|
||||
# 示例:
|
||||
# python3 ./number_parser.py ALL
|
||||
import subprocess
|
||||
|
||||
ES_search_path = "ALL disks"
|
||||
if sys.argv[1] == "ALL":
|
||||
if sys.platform == "win32":
|
||||
@@ -176,18 +190,19 @@ if __name__ == "__main__":
|
||||
ES_prog_path = 'es.exe' # es.exe需要放在PATH环境变量的路径之内
|
||||
ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
|
||||
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
|
||||
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
|
||||
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
|
||||
out_list = out_text.splitlines()
|
||||
elif sys.platform in ("linux", "darwin"):
|
||||
ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
|
||||
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
|
||||
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(
|
||||
ES_prog_path)
|
||||
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
|
||||
out_text = out_bytes.decode('utf-8')
|
||||
out_list = [ os.path.basename(line) for line in out_text.splitlines()]
|
||||
out_list = [os.path.basename(line) for line in out_text.splitlines()]
|
||||
else:
|
||||
print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.')
|
||||
sys.exit(1)
|
||||
else: # Windows single disk
|
||||
else: # Windows single disk
|
||||
if sys.platform != "win32":
|
||||
print('[!]Usage: python3 ./number_parser.py ALL')
|
||||
sys.exit(0)
|
||||
@@ -202,7 +217,7 @@ if __name__ == "__main__":
|
||||
ES_search_path = os.path.normcase(ES_search_path)
|
||||
ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
|
||||
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
|
||||
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
|
||||
out_text = out_bytes.decode('gb18030') # 中文版windows 10 x64默认输出GB18030,此编码为UNICODE方言与UTF-8系全射关系无转码损失
|
||||
out_list = out_text.splitlines()
|
||||
print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
|
||||
print(f'[+]Find {len(out_list)} Movies.')
|
||||
|
||||
Reference in New Issue
Block a user