import argparse import json import os import re import sys import shutil import typing import urllib3 import signal import platform from opencc import OpenCC import ADC_function import config from datetime import datetime, timedelta import time from pathlib import Path from ADC_function import file_modification_days, get_html, parallel_download_files from number_parser import get_number from core import core_main, moveFailedFolder def check_update(local_version): htmlcode = "" try: htmlcode = get_html("https://api.github.com/repos/yoshiko2/Movie_Data_Capture/releases/latest") except: print("===== Failed to connect to github =====") print("========== AUTO EXIT IN 60s ===========") time.sleep(60) os._exit(-1) data = json.loads(htmlcode) remote = int(data["tag_name"].replace(".","")) local_version = int(local_version.replace(".", "")) if local_version < remote: print("[*]" + ("* New update " + str(data["tag_name"]) + " *").center(54)) print("[*]" + "↓ Download ↓".center(54)) print("[*]https://github.com/yoshiko2/Movie_Data_Capture/releases") print("[*]======================================================") def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]: conf = config.getInstance() parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.") parser.add_argument("file", default='', nargs='?', help="Single Movie file path.") parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.") parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder") parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.") # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.") default_logdir = str(Path.home() / '.mlogs') parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?', help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on. default folder for current user: '{default_logdir}'. Change default folder to an empty file, or use --log-dir= to turn log off.""") parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.") parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.") parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.") parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format( os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt'))) parser.add_argument("-a", "--auto-exit", action="store_true", help="Auto exit after program complete") parser.add_argument("-g","--debug", action="store_true", help="Turn on debug mode to generate diagnostic log for issue report.") parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true", help="""Only show job list of files and numbers, and **NO** actual operation is performed. It may help you correct wrong numbers before real job.""") parser.add_argument("-v", "--version", action="version", version=ver) args = parser.parse_args() def get_natural_number_or_none(value): return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None def get_str_or_none(value): return value if isinstance(value, str) and len(value) else None def get_bool_or_none(value): return True if isinstance(value, bool) and value else None config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode) config.G_conf_override["common:source_folder"] = get_str_or_none(args.path) config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit) config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days) config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt) config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list) config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug) return args.file, args.number, args.logdir, args.regexstr, args.zero_op class OutLogger(object): def __init__(self, logfile) -> None: self.term = sys.stdout self.log = open(logfile,"w",encoding='utf-8',buffering=1) self.filepath = logfile def __del__(self): self.close() def __enter__(self): pass def __exit__(self, *args): self.close() def write(self,msg): self.term.write(msg) self.log.write(msg) def flush(self): self.term.flush() self.log.flush() os.fsync(self.log.fileno()) def close(self): if self.term != None: sys.stdout = self.term self.term = None if self.log != None: self.log.close() self.log = None class ErrLogger(OutLogger): def __init__(self, logfile) -> None: self.term = sys.stderr self.log = open(logfile,"w",encoding='utf-8',buffering=1) self.filepath = logfile def close(self): if self.term != None: sys.stderr = self.term self.term = None if self.log != None: self.log.close() self.log = None def dupe_stdout_to_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0: return log_dir = Path(logdir) if not log_dir.exists(): try: log_dir.mkdir(parents=True,exist_ok=True) except: pass if not log_dir.is_dir(): return # Tips for disabling logs by change directory to a same name empty regular file abslog_dir = log_dir.resolve() log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S") logfile = abslog_dir / f'mdc_{log_tmstr}.txt' errlog = abslog_dir / f'mdc_{log_tmstr}_err.txt' sys.stdout = OutLogger(logfile) sys.stderr = ErrLogger(errlog) def close_logfile(logdir: str): if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir): return #日志关闭前保存日志路径 filepath = None try: filepath = sys.stdout.filepath except: pass sys.stdout.close() sys.stderr.close() log_dir = Path(logdir).resolve() if isinstance(filepath, Path): print(f"Log file '{filepath}' saved.") assert(filepath.parent.samefile(log_dir)) # 清理空文件 for f in log_dir.glob(r'*_err.txt'): if f.stat().st_size == 0: try: f.unlink(missing_ok=True) except: pass # 合并日志 只检测日志目录内的文本日志,忽略子目录。三天前的日志,按日合并为单个日志,三个月前的日志, # 按月合并为单个月志,去年及以前的月志,今年4月以后将之按年合并为年志 # 测试步骤: """ LOGDIR=/tmp/mlog mkdir -p $LOGDIR for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/mdc_${f}T235959.txt;done for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/mdc_2021${f}T235959.txt;done for f in {00..23};do;echo 20211001T$f>$LOGDIR/mdc_20211001T${f}5959.txt;done echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR" # 1932 files in /tmp/mlog mdc -zgic1 -d0 -m3 -o $LOGDIR # python3 ./Movie_Data_Capture.py -zgic1 -o $LOGDIR ls $LOGDIR # rm -rf $LOGDIR """ today = datetime.today() # 第一步,合并到日。3天前的日志,文件名是同一天的合并为一份日志 for i in range(1): txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{8}T\d{6}$', f.stem, re.A)] if not txts or not len(txts): break e = [f for f in txts if '_err' in f.stem] txts.sort() tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99") deadline_day = f'mdc_{tmstr_3_days_ago}' day_merge = [f for f in txts if f.stem < deadline_day] if not day_merge or not len(day_merge): break cutday = len('T235959.txt') # cut length mdc_20201201|T235959.txt for f in day_merge: try: day_file_name = str(f)[:-cutday] + '.txt' # mdc_20201201.txt with open(day_file_name, 'a', encoding='utf-8') as m: m.write(f.read_text(encoding='utf-8')) f.unlink(missing_ok=True) except: pass # 第二步,合并到月 for i in range(1): # 利用1次循环的break跳到第二步,避免大块if缩进或者使用goto语法 txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{8}$', f.stem, re.A)] if not txts or not len(txts): break txts.sort() tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32") deadline_month = f'mdc_{tmstr_3_month_ago}' month_merge = [f for f in txts if f.stem < deadline_month] if not month_merge or not len(month_merge): break tomonth = len('01.txt') # cut length mdc_202012|01.txt for f in month_merge: try: month_file_name = str(f)[:-tomonth] + '.txt' # mdc_202012.txt with open(month_file_name, 'a', encoding='utf-8') as m: m.write(f.read_text(encoding='utf-8')) f.unlink(missing_ok=True) except: pass # 第三步,月合并到年 if today.month < 4: return mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)] if not mons or not len(mons): return mons.sort() deadline_year = f'mdc_{today.year-1}13' year_merge = [f for f in mons if f.stem < deadline_year] if not year_merge or not len(year_merge): return toyear = len('12.txt') # cut length mdc_2020|12.txt for f in year_merge: try: year_file_name = str(f)[:-toyear] + '.txt' # mdc_2020.txt with open(year_file_name, 'a', encoding='utf-8') as y: y.write(f.read_text(encoding='utf-8')) f.unlink(missing_ok=True) except: pass # 第四步,压缩年志 如果有压缩需求,请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于 # 这种粒度的文本日志,压缩比是目前最好的。lzip -9的运行参数下,日志压缩比要高于xz -9,而且内存占用更少, # 多核利用率更高(plzip多线程版本),解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右, # 100MB的日志文件能缩小到3.7MB。 def signal_handler(*args): print('[!]Ctrl+C detected, Exit.') sys.exit(9) def sigdebug_handler(*args): config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"] print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF')) # 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告 def movie_lists(source_folder, regexstr): conf = config.getInstance() main_mode = conf.main_mode() debug = conf.debug() nfo_skip_days = conf.nfo_skip_days() soft_link = conf.soft_link() file_type = conf.media_type().lower().split(",") trailerRE = re.compile(r'-trailer\.', re.IGNORECASE) cliRE = None if isinstance(regexstr, str) and len(regexstr): try: cliRE = re.compile(regexstr, re.IGNORECASE) except: pass failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt' failed_set = set() if (main_mode == 3 or soft_link) and not conf.ignore_failed_list(): try: flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines() failed_set = set(flist) if len(flist) != len(failed_set): # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的 fset = failed_set.copy() for i in range(len(flist)-1, -1, -1): fset.remove(flist[i]) if flist[i] in fset else flist.pop(i) failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8') assert len(fset) == 0 and len(flist) == len(failed_set) except: pass if not Path(source_folder).is_dir(): print('[-]Source folder not found!') return [] total = [] source = Path(source_folder).resolve() skip_failed_cnt, skip_nfo_days_cnt = 0, 0 escape_folder_set = set(re.split("[,,]", conf.escape_folder())) for full_name in source.glob(r'**/*'): if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set: continue if not full_name.suffix.lower() in file_type: continue absf = str(full_name) if absf in failed_set: skip_failed_cnt += 1 if debug: print('[!]Skip failed movie:', absf) continue is_sym = full_name.is_symlink() if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标 continue # file is symlink or hardlink(Linux/NTFS/Darwin) # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB) movie_size = 0 if is_sym else full_name.stat().st_size # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测 if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120 continue if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name): continue if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days: skip_nfo_days_cnt += 1 if debug: print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'") continue total.append(absf) if skip_failed_cnt: print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.") if skip_nfo_days_cnt: print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.") if nfo_skip_days <= 0 or not soft_link or main_mode == 3: return total # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的 skip_numbers = set() success_folder = Path(conf.success_folder()).resolve() for f in success_folder.glob(r'**/*'): if not re.match(r'\.nfo', f.suffix, re.IGNORECASE): continue if file_modification_days(f) > nfo_skip_days: continue number = get_number(False, f.stem) if not number: continue skip_numbers.add(number.lower()) rm_list = [] for f in total: n_number = get_number(False, os.path.basename(f)) if n_number and n_number.lower() in skip_numbers: rm_list.append(f) for f in rm_list: total.remove(f) if debug: print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'") if len(rm_list): print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.") return total def create_failed_folder(failed_folder): if not os.path.exists(failed_folder): # 新建failed文件夹 try: os.makedirs(failed_folder) except: print(f"[-]Fatal error! Can not make folder '{failed_folder}'") sys.exit(0) def rm_empty_folder(path): abspath = os.path.abspath(path) deleted = set() for current_dir, subdirs, files in os.walk(abspath, topdown=False): try: still_has_subdirs = any( _ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted ) if not any(files) and not still_has_subdirs and not os.path.samefile(path, current_dir): os.rmdir(current_dir) deleted.add(current_dir) print('[+]Deleting empty folder', current_dir) except: pass def create_data_and_move(file_path: str, zero_op, oCC): # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 debug = config.getInstance().debug() n_number = get_number(debug, os.path.basename(file_path)) file_path = os.path.abspath(file_path) if debug == True: print(f"[!] [{n_number}] As Number making data for '{file_path}'") if zero_op: return if n_number: core_main(file_path, n_number, oCC) else: print("[-] number empty ERROR") moveFailedFolder(file_path) print("[*]======================================================") else: try: print(f"[!] [{n_number}] As Number making data for '{file_path}'") if zero_op: return if n_number: core_main(file_path, n_number, oCC) else: raise ValueError("number empty") print("[*]======================================================") except Exception as err: print(f"[-] [{file_path}] ERROR:") print('[-]', err) try: moveFailedFolder(file_path) except Exception as err: print('[!]', err) def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC): conf = config.getInstance() file_name = os.path.basename(file_path) try: print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number)) if custom_number: core_main(file_path, custom_number, oCC) else: print("[-] number empty ERROR") print("[*]======================================================") except Exception as err: print("[-] [{}] ERROR:".format(file_path)) print('[-]', err) if conf.soft_link(): print("[-]Link {} to failed folder".format(file_path)) os.symlink(file_path, os.path.join(conf.failed_folder(), file_name)) else: try: print("[-]Move [{}] to failed folder".format(file_path)) shutil.move(file_path, os.path.join(conf.failed_folder(), file_name)) except Exception as err: print('[!]', err) def main(): version = '6.0.1' urllib3.disable_warnings() #Ignore http proxy warning # Read config.ini first, in argparse_function() need conf.failed_folder() conf = config.Config("config.ini") # Parse command line args single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version) main_mode = conf.main_mode() folder_path = "" if not main_mode in (1, 2, 3): print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.") sys.exit(4) signal.signal(signal.SIGINT, signal_handler) if sys.platform == 'win32': signal.signal(signal.SIGBREAK, sigdebug_handler) else: signal.signal(signal.SIGWINCH, sigdebug_handler) dupe_stdout_to_logfile(logdir) platform_total = str(' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version()) print('[*]================= Movie Data Capture =================') print('[*]' + version.center(54)) print('[*]======================================================') print('[*]' + platform_total) print('[*]======================================================') print('[*] - 严禁在墙内宣传本项目 - ') print('[*]======================================================') start_time = time.time() print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S")) print(f"[+]Load Config file '{conf.ini_path}'.") if conf.debug(): print('[+]Enable debug') if conf.soft_link(): print('[!]Enable soft link') if len(sys.argv)>1: print('[!]CmdLine:'," ".join(sys.argv[1:])) print('[+]Main Working mode ## {}: {} ## {}{}{}' .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1], "" if not conf.multi_threading() else ", multi_threading on", "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}", "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}" ) if not single_file_path else ('-','Single File', '','','')) ) if conf.update_check(): check_update(version) create_failed_folder(conf.failed_folder()) # Download Mapping Table, parallel version def fmd(f): return ('https://raw.githubusercontent.com/yoshiko2/Movie_Data_Capture/master/MappingTable/' + f, Path.home() / '.local' / 'share' / 'mdc' / f) map_tab = (fmd('mapping_actor.xml'), fmd('mapping_info.xml'), fmd('c_number.json')) for k,v in map_tab: if v.exists(): if file_modification_days(str(v)) >= conf.mapping_table_validity(): print("[+]Mapping Table Out of date! Remove", str(v)) os.remove(str(v)) res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists())) for i, fp in enumerate(res, start=1): if fp and len(fp): print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}") else: print(f"[-] [{i}/{len(res)}] Mapping Table Download failed") print("[-] --- AUTO EXIT AFTER 30s !!! --- ") time.sleep(30) os._exit(-1) # create OpenCC converter ccm = conf.cc_convert_mode() try: oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json') except: # some OS no OpennCC cpython, try opencc-python-reimplemented. # pip uninstall opencc && pip install opencc-python-reimplemented oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t') if not single_file_path == '': #Single File print('[+]==================== Single File =====================') if custom_number == '': create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC) else: create_data_and_move_with_custom_number(single_file_path, custom_number, oCC) else: folder_path = conf.source_folder() if not isinstance(folder_path, str) or folder_path == '': folder_path = os.path.abspath(".") movie_list = movie_lists(folder_path, regexstr) count = 0 count_all = str(len(movie_list)) print('[+]Find', count_all, 'movies.') print('[*]======================================================') stop_count = conf.stop_counter() if stop_count<1: stop_count = 999999 else: count_all = str(min(len(movie_list), stop_count)) for movie_path in movie_list: # 遍历电影列表 交给core处理 count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) create_data_and_move(movie_path, zero_op, oCC) if count >= stop_count: print("[!]Stop counter triggered!") break if conf.del_empty_folder() and not zero_op: rm_empty_folder(conf.success_folder()) rm_empty_folder(conf.failed_folder()) if len(folder_path): rm_empty_folder(folder_path) end_time = time.time() total_time = str(timedelta(seconds=end_time - start_time)) print("[+]Running time", total_time[:len(total_time) if total_time.rfind('.') < 0 else -3], " End at", time.strftime("%Y-%m-%d %H:%M:%S")) print("[+]All finished!!!") close_logfile(logdir) if not conf.auto_exit(): input("Press enter key exit, you can check the error message before you exit...") sys.exit(0) import multiprocessing if __name__ == '__main__': multiprocessing.freeze_support() main()