Merge remote-tracking branch 'origin/master'

# Conflicts:
#	.github/workflows/main.yml
yoshiko2
2022-02-26 05:12:04 +08:00
7 changed files with 383 additions and 246 deletions

View File

@@ -11,6 +11,7 @@ import time
from lxml import etree
import re
import config
import typing
from urllib.parse import urljoin
import mechanicalsoup
from requests.adapters import HTTPAdapter
@@ -25,10 +26,13 @@ def getXpathSingle(htmlcode, xpath):
return result1
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
# core web-request function
def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
"""
Core web-request function
"""
verify = config.getInstance().cacert_file()
configProxy = config.getInstance().proxy()
errors = ""
@@ -39,7 +43,8 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
try:
if configProxy.enable:
proxies = configProxy.proxies()
result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies, verify=verify,
result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
verify=verify,
cookies=cookies)
else:
result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
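For orientation, a minimal runnable sketch of the proxy-aware GET pattern this hunk reflows; the flag, proxy address, and URL below are placeholders rather than values read from the project's config:

```python
import requests

# Placeholder values; get_html() reads these from config.getInstance().
proxy_enabled = False
proxies = {"http": "http://127.0.0.1:1080", "https": "http://127.0.0.1:1080"}
headers = {"User-Agent": "Mozilla/5.0"}

if proxy_enabled:
    result = requests.get("https://example.com", headers=headers,
                          timeout=10, proxies=proxies, cookies=None)
else:
    result = requests.get("https://example.com", headers=headers,
                          timeout=10, cookies=None)
result.encoding = "utf-8"
print(result.status_code, len(result.text))
```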
@@ -91,6 +96,7 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
G_DEFAULT_TIMEOUT = 10 # seconds
class TimeoutHTTPAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs):
self.timeout = G_DEFAULT_TIMEOUT
@@ -98,6 +104,7 @@ class TimeoutHTTPAdapter(HTTPAdapter):
self.timeout = kwargs["timeout"]
del kwargs["timeout"]
super().__init__(*args, **kwargs)
def send(self, request, **kwargs):
timeout = kwargs.get("timeout")
if timeout is None:
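Pieced together, these hunks amount to the adapter below; a sketch reassembled from the fragments above, with G_DEFAULT_TIMEOUT as defined in this file. Mounting it once gives every request through the session a default timeout plus the retry policy, without repeating kwargs at each call site:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

G_DEFAULT_TIMEOUT = 10  # seconds, as in this diff

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        # pop the custom 'timeout' kwarg before HTTPAdapter sees it
        self.timeout = kwargs.pop("timeout", G_DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # fall back to the adapter-level timeout when the caller passed none
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)

session = requests.Session()
retries = Retry(total=3, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=5))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=5))
```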
@@ -106,12 +113,14 @@ class TimeoutHTTPAdapter(HTTPAdapter):
# with keep-alive feature
def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None):
configProxy = config.getInstance().proxy()
session = requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
if configProxy.enable:
@@ -142,12 +151,14 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
return None
def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, use_scraper: bool = False):
def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, use_scraper: bool = False):
configProxy = config.getInstance().proxy()
s = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
if configProxy.enable:
@@ -178,12 +189,14 @@ def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, re
return None
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
return_type: str = None, encoding: str = None):
configProxy = config.getInstance().proxy()
s = requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
if configProxy.enable:
@@ -216,12 +229,14 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
return None
def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None):
configProxy = config.getInstance().proxy()
session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
if configProxy.enable:
@@ -285,7 +300,12 @@ def translate(
app_id: str = "",
key: str = "",
delay: int = 0,
):
) -> str:
"""
Translate Japanese (kana) text to Simplified Chinese
:raises ValueError: Non-existent translation engine
"""
trans_result = ""
# Chinese sentences containing '&' and similar symbols get truncated by Google Translate and lose content, and Chinese-to-Chinese translation is pointless anyway, so such text is ignored; only text containing Japanese kana is translated
if not is_japanese(src):
@@ -324,26 +344,27 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
return trans_result
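A hedged usage sketch of translate(); the input string is arbitrary and the Azure key is a placeholder, not a working credential:

```python
# google-free is the default engine and needs no app_id/key.
zh = translate("日本語のタイトル")
# Azure variant, mirroring the call site later in this commit:
# zh = translate("日本語のタイトル", target_language="zh-Hans",
#                engine="azure", key="<azure-key>")
print(zh)
```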
# Cookies exported from a logged-in browser let the scraper open member-only pages that guests cannot access
def load_cookies(filename):
filename = os.path.basename(filename)
def load_cookies(cookie_json_filename: str):
"""
Load cookies, used to access member-only (non-guest) content
:filename: cookie file name. To obtain it, log in on the site, then copy or export the cookie content with a browser extension (CookieBro or EditThisCookie) or directly from the site info in the address bar, and save it as JSON
# Example: FC2-755670 url https://javdb9.com/v/vO8Mn
# JSON file format
# file name: sitename.json, e.g. javdb9.json
# content (file encoding: UTF-8)
{
"over18":"1",
"redirect_to":"%2Fv%2FvO8Mn",
"remember_me_token":"***********",
"_jdb_session":"************",
"locale":"zh",
"__cfduid":"*********",
"theme":"auto"
}
"""
filename = os.path.basename(cookie_json_filename)
if not len(filename):
return None, None
path_search_order = (
@@ -364,8 +385,11 @@ def load_cookies(filename):
except:
return None, None
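A hedged usage sketch, assuming load_cookies() returns a (cookies dict, file path) pair as its None, None failure path suggests; javdb9.json is the example file name from the docstring above:

```python
cookies, cookie_file = load_cookies('javdb9.json')
if isinstance(cookies, dict):
    html = get_html('https://javdb9.com/v/vO8Mn', cookies=cookies)
```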
# days elapsed since the file was last modified
def file_modification_days(filename) -> int:
def file_modification_days(filename: str) -> int:
"""
Days elapsed since the file was last modified
"""
mfile = Path(filename)
if not mfile.is_file():
return 9999
@@ -376,18 +400,24 @@ def file_modification_days(filename) -> int:
return 9999
return days
def file_not_exist_or_empty(filepath) -> bool:
return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
# quick Japanese-text detection
def is_japanese(s) -> bool:
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
def is_japanese(raw: str) -> bool:
"""
Quick Japanese-text detection (kana Unicode ranges)
"""
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
# Usage: python ./ADC_function.py https://cn.bing.com/
if __name__ == "__main__":
import sys, timeit
from http.client import HTTPConnection
def benchmark(t, url):
print(f"HTTP GET Benchmark times:{t} url:{url}")
tm = timeit.timeit(f"_ = session1.get('{url}')",
@@ -406,7 +436,10 @@ if __name__ == "__main__":
"from __main__ import get_html",
number=t)
print(f' *{tm:>10.5f}s get_html()')
t = 100
# url = "https://www.189.cn/"
url = "http://www.chinaunicom.com"
HTTPConnection.debuglevel = 1
@@ -417,7 +450,11 @@ if __name__ == "__main__":
url = sys.argv[1]
benchmark(t, url)
def download_file_with_filename(url, filename, path):
def download_file_with_filename(url: str, filename: str, path: str) -> None:
"""
Download the file from the given url and save it under the given name to the given path
"""
conf = config.getInstance()
configProxy = conf.proxy()
@@ -475,26 +512,39 @@ def download_file_with_filename(url, filename, path):
raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
return
def download_one_file(args):
def download_one_file(args) -> str:
"""
Download the file from the given url and save it to the given path;
wrapped as a single-argument callable for map()
"""
def _inner(url: str, save_path: Path):
filebytes = get_html(url, return_type='content')
if isinstance(filebytes, bytes) and len(filebytes):
if len(filebytes) == save_path.open('wb').write(filebytes):
return str(save_path)
return _inner(*args)
def parallel_download_files(dn_list, parallel: int = 0):
def parallel_download_files(dn_list: typing.Iterable[typing.Sequence], parallel: int = 0):
"""
Download files in parallel (multi-threaded)
Usage example: two threads downloading two different files to different paths; the directories may not exist yet, but write permission on the target directories and files is required
parallel_download_files([
('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
])
:dn_list: a tuple or list of pairs: ((url1, save_fullpath1),(url2, save_fullpath2),); fullpath may be str or Path
:parallel: thread-pool size for parallel downloads; 0 lets the function decide
"""
mp_args = []
for url, fullpath in dn_list:
if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
if url and isinstance(url, str) and url.startswith('http') \
and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
fullpath = Path(fullpath)
fullpath.parent.mkdir(parents=True, exist_ok=True)
mp_args.append((url, fullpath))
@@ -506,7 +556,11 @@ def parallel_download_files(dn_list, parallel: int = 0):
results = list(pool.map(download_one_file, mp_args))
return results
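A minimal sketch of the map-over-a-thread-pool pattern behind download_one_file and parallel_download_files; the hunk does not show which pool implementation the project uses, so concurrent.futures stands in here:

```python
from concurrent.futures import ThreadPoolExecutor

def parallel_map(func, args_list, parallel: int = 0):
    # parallel == 0 lets the sketch pick a small default, as the docstring above describes
    workers = parallel or min(5, max(1, len(args_list)))
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(func, args_list))
```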
def delete_all_elements_in_list(string,lists):
def delete_all_elements_in_list(string: str, lists: typing.Iterable[str]):
"""
Remove every occurrence of the given string from the given list
"""
new_lists = []
for i in lists:
if i != string:

View File

@@ -3,18 +3,19 @@ import json
import os
import re
import sys
import time
import shutil
import typing
import urllib3
import signal
import platform
import multiprocessing
from datetime import datetime, timedelta
from pathlib import Path
from opencc import OpenCC
import ADC_function
import config
from datetime import datetime, timedelta
import time
from pathlib import Path
from ADC_function import file_modification_days, get_html, parallel_download_files
from number_parser import get_number
from core import core_main, moveFailedFolder
@@ -44,7 +45,8 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
parser.add_argument("-p", "--path", default='', nargs='?', help="Analysis folder path.")
parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
parser.add_argument("-m", "--main-mode", default='', nargs='?',
help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
# parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
default_logdir = str(Path.home() / '.mlogs')
@@ -52,9 +54,12 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
default folder for current user: '{default_logdir}'. Change default folder to an empty file,
or use --log-dir= to turn log off.""")
parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
parser.add_argument("-q", "--regex-query", dest='regexstr', default='', nargs='?',
help="python re module regex filepath filtering.")
parser.add_argument("-d", "--nfo-skip-days", dest='days', default='', nargs='?',
help="Override nfo_skip_days value in config.")
parser.add_argument("-c", "--stop-counter", dest='cnt', default='', nargs='?',
help="Override stop_counter value in config.")
parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
parser.add_argument("-a", "--auto-exit", action="store_true",
@@ -67,12 +72,16 @@ is performed. It may help you correct wrong numbers before real job.""")
parser.add_argument("-v", "--version", action="version", version=ver)
args = parser.parse_args()
def get_natural_number_or_none(value):
return int(value) if isinstance(value, str) and value.isnumeric() and int(value) >= 0 else None
def get_str_or_none(value):
return value if isinstance(value, str) and len(value) else None
def get_bool_or_none(value):
return True if isinstance(value, bool) and value else None
config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
@@ -83,43 +92,53 @@ is performed. It may help you correct wrong numbers before real job.""")
return args.file, args.number, args.logdir, args.regexstr, args.zero_op
class OutLogger(object):
def __init__(self, logfile) -> None:
self.term = sys.stdout
self.log = open(logfile, "w", encoding='utf-8', buffering=1)
self.filepath = logfile
def __del__(self):
self.close()
def __enter__(self):
pass
def __exit__(self, *args):
self.close()
def write(self, msg):
self.term.write(msg)
self.log.write(msg)
def flush(self):
self.term.flush()
self.log.flush()
os.fsync(self.log.fileno())
def close(self):
if self.term != None:
if self.term is not None:
sys.stdout = self.term
self.term = None
if self.log != None:
if self.log is not None:
self.log.close()
self.log = None
class ErrLogger(OutLogger):
def __init__(self, logfile) -> None:
self.term = sys.stderr
self.log = open(logfile, "w", encoding='utf-8', buffering=1)
self.filepath = logfile
def close(self):
if self.term != None:
if self.term is not None:
sys.stderr = self.term
self.term = None
if self.log != None:
if self.log is not None:
self.log.close()
self.log = None
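A hedged usage sketch of the two tee-style loggers; the log paths are hypothetical:

```python
import sys

sys.stdout = OutLogger('/tmp/mdc_stdout.log')  # hypothetical paths
sys.stderr = ErrLogger('/tmp/mdc_stderr.log')
print('this line reaches both the terminal and the log file')
```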
@@ -254,13 +273,14 @@ def signal_handler(*args):
print('[!]Ctrl+C detected, Exit.')
sys.exit(9)
def sigdebug_handler(*args):
config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'Off'))
# Added: skip files on the failed list, skip by .nfo modification age, report the total number skipped, list each skipped file in debug mode (-g), and skip small ad clips
def movie_lists(source_folder, regexstr):
def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
conf = config.getInstance()
main_mode = conf.main_mode()
debug = conf.debug()
@@ -311,11 +331,12 @@ def movie_lists(source_folder, regexstr):
continue # file is symlink or hardlink(Linux/NTFS/Darwin)
# 0-byte debug samples are allowed through; files smaller than 120MB are dropped as small ads: '苍老师强力推荐.mp4'(102.2MB) '黑道总裁.mp4'(98.4MB) '有趣的妹子激情表演.MP4'(95MB) '有趣的臺灣妹妹直播.mp4'(15.1MB)
movie_size = 0 if is_sym else full_name.stat().st_size # as above: symlinks skip stat()/st_size and get 0, bypassing the small-video check
if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
if 0 < movie_size < 125829120: # 1024*1024*120=125829120
continue
if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
continue
if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(
full_name.with_suffix('.nfo')) <= nfo_skip_days:
skip_nfo_days_cnt += 1
if debug:
print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
@@ -325,7 +346,8 @@ def movie_lists(source_folder, regexstr):
if skip_failed_cnt:
print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
if skip_nfo_days_cnt:
print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' whose .nfo was modified within {nfo_skip_days} days.")
print(
f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' whose .nfo was modified within {nfo_skip_days} days.")
if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
return total
# In soft-link mode, titles already scraped successfully also need the .nfo age check against the success folder; skip any updated within N days
@@ -351,13 +373,17 @@ def movie_lists(source_folder, regexstr):
if debug:
print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
if len(rm_list):
print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' whose .nfo was modified within {nfo_skip_days} days.")
print(
f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' whose .nfo was modified within {nfo_skip_days} days.")
return total
def create_failed_folder(failed_folder):
if not os.path.exists(failed_folder): # create the failed folder
def create_failed_folder(failed_folder: str):
"""
Create the failed folder if it does not exist
"""
if not os.path.exists(failed_folder):
try:
os.makedirs(failed_folder)
except:
@@ -370,9 +396,7 @@ def rm_empty_folder(path):
deleted = set()
for current_dir, subdirs, files in os.walk(abspath, topdown=False):
try:
still_has_subdirs = any(
_ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted
)
still_has_subdirs = any(_ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted)
if not any(files) and not still_has_subdirs and not os.path.samefile(path, current_dir):
os.rmdir(current_dir)
deleted.add(current_dir)
@@ -387,7 +411,7 @@ def create_data_and_move(file_path: str, zero_op, oCC):
n_number = get_number(debug, os.path.basename(file_path))
file_path = os.path.abspath(file_path)
if debug == True:
if debug is True:
print(f"[!] [{n_number}] As Number making data for '{file_path}'")
if zero_op:
return
@@ -443,7 +467,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
def main():
version = '6.0.1'
version = '6.0.2'
urllib3.disable_warnings() # Ignore http proxy warning
# Read config.ini first, in argparse_function() need conf.failed_folder()
@@ -456,7 +480,7 @@ def main():
main_mode = conf.main_mode()
folder_path = ""
if not main_mode in (1, 2, 3):
if main_mode not in (1, 2, 3):
print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
sys.exit(4)
@@ -467,7 +491,8 @@ def main():
signal.signal(signal.SIGWINCH, sigdebug_handler)
dupe_stdout_to_logfile(logdir)
platform_total = str(' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version())
platform_total = str(
' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version())
print('[*]================= Movie Data Capture =================')
print('[*]' + version.center(54))
@@ -501,9 +526,13 @@ def main():
create_failed_folder(conf.failed_folder())
# Download Mapping Table, parallel version
def fmd(f):
def fmd(f) -> typing.Tuple[str, Path]:
"""
Build the (download url, local save path) pair for a mapping-table file
"""
return ('https://raw.githubusercontent.com/yoshiko2/Movie_Data_Capture/master/MappingTable/' + f,
Path.home() / '.local' / 'share' / 'mdc' / f)
map_tab = (fmd('mapping_actor.xml'), fmd('mapping_info.xml'), fmd('c_number.json'))
for k, v in map_tab:
if v.exists():
@@ -525,14 +554,15 @@ def main():
try:
oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json')
except:
# some OS no OpennCC cpython, try opencc-python-reimplemented.
# some OS no OpenCC cpython, try opencc-python-reimplemented.
# pip uninstall opencc && pip install opencc-python-reimplemented
oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t')
if not single_file_path == '': # Single File
print('[+]==================== Single File =====================')
if custom_number == '':
create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
create_data_and_move_with_custom_number(single_file_path,
get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
else:
create_data_and_move_with_custom_number(single_file_path, custom_number, oCC)
else:
@@ -555,7 +585,8 @@ def main():
for movie_path in movie_list: # iterate over the movie list, handing each file to core
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -',
time.strftime("%H:%M:%S")))
create_data_and_move(movie_path, zero_op, oCC)
if count >= stop_count:
print("[!]Stop counter triggered!")
@@ -581,7 +612,7 @@ def main():
sys.exit(0)
import multiprocessing
if __name__ == '__main__':
multiprocessing.freeze_support()
main()

View File

@@ -38,9 +38,10 @@ def get_data_state(data: dict) -> bool: # metadata-fetch failure check
return True
def get_data_from_json(file_number, oCC): # return metadata from JSON
def get_data_from_json(file_number, oCC):
"""
iterate through all services and fetch the data
iterate through all services and fetch the data; return the metadata from JSON
"""
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
@@ -67,7 +68,7 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
if not len(conf.sources()) > 80:
if len(sources) <= len(func_mapping):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
@@ -235,8 +236,8 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
json_data['studio'] = studio
json_data['director'] = director
if conf.is_transalte():
translate_values = conf.transalte_values().split(",")
if conf.is_translate():
translate_values = conf.translate_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
@@ -248,12 +249,12 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
continue
except:
pass
if conf.get_transalte_engine() == "azure":
if conf.get_translate_engine() == "azure":
t = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_transalte_engine(),
key=conf.get_transalte_key(),
engine=conf.get_translate_engine(),
key=conf.get_translate_key(),
)
else:
t = translate(json_data[translate_value])
@@ -326,11 +327,13 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
if i not in json_data:
naming_rule += i.strip("'").strip('"')
else:
naming_rule += json_data.get(i)
item = json_data.get(i)
naming_rule += item if type(item) is not list else "&".join(item)
json_data['naming_rule'] = naming_rule
return json_data
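A small sketch of the list-join fix above, with a hypothetical list-valued field such as multiple actors; the 'title-' prefix is likewise made up:

```python
item = ['Actor-A', 'Actor-B']  # hypothetical list-valued json_data field
naming_rule = 'title-'
naming_rule += item if not isinstance(item, list) else '&'.join(item)
print(naming_rule)  # title-Actor-A&Actor-B
```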
def special_characters_replacement(text) -> str:
if not isinstance(text, str):
return text

View File

@@ -8,7 +8,7 @@ success_output_folder=JAV_output
soft_link=0
failed_move=1
auto_exit=0
transalte_to_sc=0
translate_to_sc=0
multi_threading=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
actor_gender=female
@@ -51,7 +51,7 @@ folders=failed,JAV_output
switch=0
; machine translation
[transalte]
[translate]
switch=0
;options: google-free, azure
engine=google-free
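A hedged sketch of reading the renamed section with the standard library, using the file name and keys shown in this diff:

```python
import configparser

conf = configparser.ConfigParser()
conf.read('config.ini', encoding='utf-8')
if conf.getboolean('translate', 'switch'):
    print(conf.get('translate', 'engine'))  # google-free or azure
```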

View File

@@ -5,7 +5,6 @@ import configparser
import time
from pathlib import Path
G_conf_override = {
# index 0 save Config() first instance for quick access by using getInstance()
0: None,
@@ -98,14 +97,18 @@ class Config:
# print("[-]",e)
# sys.exit(3)
# #self.conf = self._default_config()
def getboolean_override(self, section, item) -> bool:
return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"])
return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(
G_conf_override[f"{section}:{item}"])
def getint_override(self, section, item) -> int:
return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"])
return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(
G_conf_override[f"{section}:{item}"])
def get_override(self, section, item) -> str:
return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"])
return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(
G_conf_override[f"{section}:{item}"])
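Restated as a standalone sketch: the override pattern these getters implement keeps a module-level dict, keyed "section:item", that takes priority over config.ini whenever a CLI flag has set it:

```python
# Standalone sketch; in the project the dict lives at module scope in config.py.
G_conf_override = {"common:main_mode": None}

def getint_override(conf, section, item) -> int:
    override = G_conf_override[f"{section}:{item}"]
    return conf.getint(section, item) if override is None else int(override)
```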
def main_mode(self) -> int:
try:
@@ -127,34 +130,46 @@ class Config:
def soft_link(self) -> bool:
return self.conf.getboolean("common", "soft_link")
def failed_move(self) -> bool:
return self.conf.getboolean("common", "failed_move")
def auto_exit(self) -> bool:
return self.getboolean_override("common", "auto_exit")
def transalte_to_sc(self) -> bool:
return self.conf.getboolean("common", "transalte_to_sc")
def translate_to_sc(self) -> bool:
return self.conf.getboolean("common", "translate_to_sc")
def multi_threading(self) -> bool:
return self.conf.getboolean("common", "multi_threading")
def del_empty_folder(self) -> bool:
return self.conf.getboolean("common", "del_empty_folder")
def nfo_skip_days(self) -> int:
try:
return self.getint_override("common", "nfo_skip_days")
except:
return 30
def stop_counter(self) -> int:
try:
return self.getint_override("common", "stop_counter")
except:
return 0
def ignore_failed_list(self) -> bool:
return self.getboolean_override("common", "ignore_failed_list")
def download_only_missing_images(self) -> bool:
return self.conf.getboolean("common", "download_only_missing_images")
def mapping_table_validity(self) -> int:
return self.conf.getint("common", "mapping_table_validity")
def is_transalte(self) -> bool:
return self.conf.getboolean("transalte", "switch")
def is_translate(self) -> bool:
return self.conf.getboolean("translate", "switch")
def is_trailer(self) -> bool:
return self.conf.getboolean("trailer", "switch")
@@ -190,18 +205,25 @@ class Config:
return extrafanart_download
except ValueError:
self._exit("extrafanart_folder")
def get_transalte_engine(self) -> str:
return self.conf.get("transalte","engine")
# def get_transalte_appId(self) ->str:
# return self.conf.get("transalte","appid")
def get_transalte_key(self) -> str:
return self.conf.get("transalte","key")
def get_transalte_delay(self) -> int:
return self.conf.getint("transalte","delay")
def transalte_values(self) -> str:
return self.conf.get("transalte", "values")
def get_translate_engine(self) -> str:
return self.conf.get("translate", "engine")
# def get_translate_appId(self) ->str:
# return self.conf.get("translate","appid")
def get_translate_key(self) -> str:
return self.conf.get("translate", "key")
def get_translate_delay(self) -> int:
return self.conf.getint("translate", "delay")
def translate_values(self) -> str:
return self.conf.get("translate", "values")
def get_translate_service_site(self) -> str:
return self.conf.get("transalte", "service_site")
return self.conf.get("translate", "service_site")
def proxy(self):
try:
sec = "proxy"
@@ -320,7 +342,6 @@ class Config:
except:
return "hog"
@staticmethod
def _exit(sec: str) -> None:
print("[-] Read config error! Please check the {} section in config.ini".format(sec))
@@ -340,7 +361,7 @@ class Config:
conf.set(sec1, "soft_link", "0")
conf.set(sec1, "failed_move", "1")
conf.set(sec1, "auto_exit", "0")
conf.set(sec1, "transalte_to_sc", "1")
conf.set(sec1, "translate_to_sc", "1")
# actor_gender value: female or male or both or all (includes transgender)
conf.set(sec1, "actor_gender", "female")
conf.set(sec1, "del_empty_folder", "1")
@@ -358,7 +379,6 @@ class Config:
conf.set(sec2, "type", "socks5")
conf.set(sec2, "cacert_file", "")
sec3 = "Name_Rule"
conf.add_section(sec3)
conf.set(sec3, "location_rule", "actor + '/' + number")
@@ -382,7 +402,7 @@ class Config:
conf.add_section(sec7)
conf.set(sec7, "switch", "0")
sec8 = "transalte"
sec8 = "translate"
conf.add_section(sec8)
conf.set(sec8, "switch", "0")
conf.set(sec8, "engine", "google-free")
@@ -402,8 +422,10 @@ class Config:
sec11 = "media"
conf.add_section(sec11)
conf.set(sec11, "media_type", ".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.MP4,.AVI,.RMVB,.WMV,.MOV,.MKV,.FLV,.TS,.WEBM,iso,ISO")
conf.set(sec11, "sub_type", ".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")
conf.set(sec11, "media_type",
".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.MP4,.AVI,.RMVB,.WMV,.MOV,.MKV,.FLV,.TS,.WEBM,iso,ISO")
conf.set(sec11, "sub_type",
".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")
sec12 = "watermark"
conf.add_section(sec12)
@@ -464,7 +486,8 @@ class IniProxy():
'''
if self.address:
if self.proxytype in self.SUPPORT_PROXY_TYPE:
proxies = {"http": self.proxytype + "://" + self.address, "https": self.proxytype + "://" + self.address}
proxies = {"http": self.proxytype + "://" + self.address,
"https": self.proxytype + "://" + self.address}
else:
proxies = {"http": "http://" + self.address, "https": "https://" + self.address}
else:
@@ -477,8 +500,11 @@ if __name__ == "__main__":
def evprint(evstr):
code = compile(evstr, "<string>", "eval")
print('{}: "{}"'.format(evstr, eval(code)))
config = Config()
mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'}
mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override',
'ini_path'}
for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
evprint(f'config.{_m}()')
pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}

core.py (26 changed lines)
View File

@@ -371,15 +371,19 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
moveFailedFolder(filepath)
return
# This function was copied over from the GUI version
# Parameter notes
# poster_path
# thumb_path
# cn_sub Chinese subtitles, value 1 or 0
# leak leaked, value 1 or 0
# uncensored uncensored, value 1 or 0
# ======================================================================== add watermark
def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack):
def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack) -> None:
"""
Add watermarks describing extra properties to the poster and thumb (copied over from the GUI version)
:poster_path: poster location
:thumb_path: thumb location
:cn_sub: Chinese subtitles; accepted values 1, "1" or any other truthy value
:leak: leaked; accepted values 1, "1" or any other truthy value
:uncensored: uncensored; accepted values 1, "1" or any other truthy value
:hack: cracked; accepted values 1, "1" or any other truthy value
"""
mark_type = ''
if cn_sub:
mark_type += ',字幕'
@@ -396,6 +400,7 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack):
add_mark_thread(poster_path, cn_sub, leak, uncensored, hack)
print('[+]Poster Add Mark: ' + mark_type.strip(','))
def add_mark_thread(pic_path, cn_sub, leak, uncensored, hack):
size = 9
img_pic = Image.open(pic_path)
@@ -414,6 +419,7 @@ def add_mark_thread(pic_path, cn_sub, leak, uncensored, hack):
add_to_pic(pic_path, img_pic, size, count, 4)
img_pic.close()
def add_to_pic(pic_path, img_pic, size, count, mode):
mark_pic_path = ''
pngpath = ''
@@ -455,6 +461,7 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
img_pic.save(pic_path, quality=95)
# ======================== end =================================
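A hedged sketch of the corner-paste technique add_to_pic implements with PIL; the scale factor, paths, and corner numbering here are illustrative, not the project's exact values:

```python
from PIL import Image

def paste_mark(pic_path: str, mark_png: str, corner: int = 4) -> None:
    img = Image.open(pic_path)
    mark = Image.open(mark_png)
    # scale the mark relative to the picture width
    w = img.width // 5
    mark = mark.resize((w, int(mark.height * w / mark.width)))
    # corner: 1=top-left, 2=top-right, 3=bottom-right, 4=bottom-left
    pos = {1: (0, 0), 2: (img.width - mark.width, 0),
           3: (img.width - mark.width, img.height - mark.height),
           4: (0, img.height - mark.height)}[corner]
    img.paste(mark, pos, mask=mark.convert('RGBA'))  # alpha channel as mask
    img.save(pic_path, quality=95)
```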
def paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word): # file path, number, suffix, destination to move to
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
@@ -546,6 +553,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
print(f'[-]OS Error errno {oserr.errno}')
return
def get_part(filepath):
try:
if re.search('-CD\d+', filepath):

View File

@@ -2,6 +2,7 @@ import os
import re
import sys
import config
import typing
G_spat = re.compile(
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|"
@@ -9,30 +10,30 @@ G_spat = re.compile(
re.IGNORECASE)
def get_number(debug,file_path: str) -> str:
# """
# >>> from number_parser import get_number
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
# 'snis-829'
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829-C.mp4")
# 'snis-829'
# >>> get_number("C:¥Users¥Guest¥snis-829.mp4")
# 'snis-829'
# >>> get_number("C:¥Users¥Guest¥snis-829-C.mp4")
# 'snis-829'
# >>> get_number("./snis-829.mp4")
# 'snis-829'
# >>> get_number("./snis-829-C.mp4")
# 'snis-829'
# >>> get_number(".¥snis-829.mp4")
# 'snis-829'
# >>> get_number(".¥snis-829-C.mp4")
# 'snis-829'
# >>> get_number("snis-829.mp4")
# 'snis-829'
# >>> get_number("snis-829-C.mp4")
# 'snis-829'
# """
def get_number(debug: bool, file_path: str) -> str:
"""
Extract the movie number (番号) from a file path; doctest setup: from number_parser import get_number
>>> get_number(False, "/Users/Guest/AV_Data_Capture/snis-829.mp4")
'snis-829'
>>> get_number(False, "/Users/Guest/AV_Data_Capture/snis-829-C.mp4")
'snis-829'
>>> get_number(False, "C:¥Users¥Guest¥snis-829.mp4")
'snis-829'
>>> get_number(False, "C:¥Users¥Guest¥snis-829-C.mp4")
'snis-829'
>>> get_number(False, "./snis-829.mp4")
'snis-829'
>>> get_number(False, "./snis-829-C.mp4")
'snis-829'
>>> get_number(False, ".¥snis-829.mp4")
'snis-829'
>>> get_number(False, ".¥snis-829-C.mp4")
'snis-829'
>>> get_number(False, "snis-829.mp4")
'snis-829'
>>> get_number(False, "snis-829-C.mp4")
'snis-829'
"""
filepath = os.path.basename(file_path)
# The debug True and False code paths were merged because this module and function only do string computation with no IO; when debug is on, printing the exception info is enough
try:
@@ -79,7 +80,8 @@ G_TAKE_NUM_RULES = {
'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0]
}
def get_number_by_dict(filename: str) -> str:
def get_number_by_dict(filename: str) -> typing.Optional[str]:
try:
for k, v in G_TAKE_NUM_RULES.items():
if re.search(k, filename, re.I):
@@ -88,10 +90,13 @@ def get_number_by_dict(filename: str) -> str:
pass
return None
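The same rules-table dispatch, restated as a runnable sketch around the heyzo rule shown above: the first regex key that matches the filename wins, and its lambda builds the number.

```python
import re
import typing

RULES = {'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0]}

def number_by_rules(filename: str) -> typing.Optional[str]:
    for pattern, build in RULES.items():
        if re.search(pattern, filename, re.I):
            return build(filename)
    return None

print(number_by_rules('sbw99.cc@heyzo_hd_2636_full.mp4'))  # HEYZO-2636
```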
class Cache_uncensored_conf:
prefix = None
def is_empty(self):
return bool(self.prefix is None)
def set(self, v: list):
if not v or not len(v) or not len(v[0]):
raise ValueError('input prefix list empty or None')
@@ -100,13 +105,16 @@ class Cache_uncensored_conf:
for i in v[1:]:
s += f"|{i}.+"
self.prefix = re.compile(s, re.I)
def check(self, number):
if self.prefix is None:
raise ValueError('No init re compile')
return self.prefix.match(number)
G_cache_uncensored_conf = Cache_uncensored_conf()
# ======================================================================== uncensored check
def is_uncensored(number):
if re.match(
@@ -119,6 +127,7 @@ r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydoug
G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
return G_cache_uncensored_conf.check(number)
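A hedged usage sketch of the cached prefix matcher; the prefixes are hypothetical stand-ins for config.getInstance().get_uncensored().split(','):

```python
cache = Cache_uncensored_conf()
if cache.is_empty():
    cache.set(['S2M', 'BT', 'LAF'])  # hypothetical uncensored prefixes
print(bool(cache.check('S2M-001')))  # True: compiled as S2M.+|BT.+|LAF.+
```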
if __name__ == "__main__":
# import doctest
# doctest.testmod(raise_on_error=True)
@@ -145,9 +154,13 @@ if __name__ == "__main__":
"pacopacomama-093021_539-FHD.mkv", # newly supported studio format 093021_539, naming rule taken from the javdb data source
"sbw99.cc@heyzo_hd_2636_full.mp4"
)
def evprint(evstr):
code = compile(evstr, "<string>", "eval")
print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
for t in test_use_cases:
evprint(f'get_number(True, "{t}")')
@@ -170,6 +183,7 @@ if __name__ == "__main__":
# Example:
# python3 ./number_parser.py ALL
import subprocess
ES_search_path = "ALL disks"
if sys.argv[1] == "ALL":
if sys.platform == "win32":
@@ -181,7 +195,8 @@ if __name__ == "__main__":
out_list = out_text.splitlines()
elif sys.platform in ("linux", "darwin"):
ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(
ES_prog_path)
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('utf-8')
out_list = [os.path.basename(line) for line in out_text.splitlines()]