diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0988bdc..34c2c61 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -39,10 +39,10 @@ jobs:
         run: |
           pyinstaller \
             --onefile Movie_Data_Capture.py \
-            --hidden-import ADC_function.py \
-            --hidden-import core.py \
+            --hidden-import "ImageProcessing.hog" \
             --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
             --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
+            --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
             --add-data "Img:Img" \
             --add-data "config.ini:." \
@@ -51,10 +51,10 @@ jobs:
         run: |
           pyinstaller `
             --onefile Movie_Data_Capture.py `
-            --hidden-import ADC_function.py `
-            --hidden-import core.py `
+            --hidden-import "ImageProcessing.hog" `
             --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
             --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
+            --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1);face_recognition_models" `
             --add-data "Img;Img" `
             --add-data "config.ini;." `
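The build changes above follow from how the new ImageProcessing package works: the hog backend is loaded dynamically through importlib, so PyInstaller's static analysis never sees it, and face_recognition_models ships its .dat model files as package data that also has to be bundled explicitly. A minimal pre-build sanity check one could run locally (illustrative only, not part of this patch; assumes the three packages are installed):

    # verify_bundle_inputs.py -- print the source paths the --add-data flags rely on
    import importlib

    for pkg in ("cloudscraper", "opencc", "face_recognition_models"):
        path = importlib.import_module(pkg).__path__[0]
        print(f"{pkg}: {path}")

    # the hidden import must resolve the same way ImageProcessing/__init__.py resolves it at runtime
    importlib.import_module(".hog", "ImageProcessing")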
diff --git a/ADC_function.py b/ADC_function.py
index 1a64477..e310dd5 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,6 +1,6 @@
 from os import replace
 import requests
-#import hashlib
+# import hashlib
 from pathlib import Path
 import secrets
 import os.path
@@ -11,6 +11,7 @@ import time
 from lxml import etree
 import re
 import config
+import typing
 from urllib.parse import urljoin
 import mechanicalsoup
 from requests.adapters import HTTPAdapter
@@ -25,10 +26,13 @@ def getXpathSingle(htmlcode, xpath):
     return result1
 
 
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
+
 
-# 网页请求核心
 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    """
+    网页请求核心函数
+    """
     verify = config.getInstance().cacert_file()
     configProxy = config.getInstance().proxy()
     errors = ""
@@ -39,7 +43,8 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
     try:
         if configProxy.enable:
             proxies = configProxy.proxies()
-            result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies, verify=verify,
+            result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
+                                  verify=verify,
                                   cookies=cookies)
         else:
             result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
@@ -89,7 +94,8 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
         print("[-]" + errors)
 
 
-G_DEFAULT_TIMEOUT = 10 # seconds
+G_DEFAULT_TIMEOUT = 10  # seconds
+
 
 class TimeoutHTTPAdapter(HTTPAdapter):
     def __init__(self, *args, **kwargs):
@@ -98,6 +104,7 @@ class TimeoutHTTPAdapter(HTTPAdapter):
             self.timeout = kwargs["timeout"]
             del kwargs["timeout"]
         super().__init__(*args, **kwargs)
+
     def send(self, request, **kwargs):
         timeout = kwargs.get("timeout")
         if timeout is None:
@@ -106,12 +113,14 @@ class TimeoutHTTPAdapter(HTTPAdapter):
 
 
 # with keep-alive feature
-def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
+                     encoding: str = None):
     configProxy = config.getInstance().proxy()
     session = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
     session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
@@ -122,7 +131,7 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
     try:
         if isinstance(url, str) and len(url):
             result = session.get(str(url))
-        else: # 空url参数直接返回可重用session对象,无需设置return_type
+        else:  # 空url参数直接返回可重用session对象,无需设置return_type
             return session
         if not result.ok:
             return None
@@ -142,12 +151,14 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
     return None
 
 
-def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, use_scraper: bool = False):
+def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
+                        encoding: str = None, use_scraper: bool = False):
     configProxy = config.getInstance().proxy()
-    s = create_scraper(browser={'custom': ua or G_USER_AGENT,}) if use_scraper else requests.Session()
+    s = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
@@ -178,12 +189,14 @@ def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, re
     return None
 
 
-def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
+                     return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
@@ -216,12 +229,14 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
     return None
 
 
-def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
+                        encoding: str = None):
     configProxy = config.getInstance().proxy()
-    session = create_scraper(browser={'custom': ua or G_USER_AGENT,})
+    session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+                    status_forcelist=[429, 500, 502, 503, 504])
     session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
@@ -230,7 +245,7 @@ def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, re
     try:
         if isinstance(url, str) and len(url):
             result = session.get(str(url))
-        else: # 空url参数直接返回可重用scraper对象,无需设置return_type
+        else:  # 空url参数直接返回可重用scraper对象,无需设置return_type
             return session
         if not result.ok:
             return None
@@ -285,7 +300,12 @@ def translate(
     app_id: str = "",
     key: str = "",
     delay: int = 0,
-):
+) -> str:
+    """
+    translate japanese kana to simplified chinese
+    翻译日语假名到简体中文
+    :raises ValueError: Non-existent translation engine
+    """
     trans_result = ""
     # 中文句子如果包含&等符号会被谷歌翻译截断损失内容,而且中文翻译到中文也没有意义,故而忽略,只翻译带有日语假名的
     if not is_japanese(src):
@@ -295,7 +315,7 @@
         if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite):
             gsite = 'translate.google.cn'
         url = (
-f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={target_language}&q={src}"
+            f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={target_language}&q={src}"
         )
         result = get_html(url=url, return_type="object")
         if not result.ok:
@@ -324,26 +344,27 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
     return trans_result
 
 
-# 从浏览器中导出网站登录验证信息的cookies,能够以会员方式打开游客无法访问到的页面
-# 示例: FC2-755670 url https://javdb9.com/v/vO8Mn
-# json 文件格式
-# 文件名: 站点名.json,示例 javdb9.json
-# 内容(文件编码:UTF-8):
-'''
-{
-    "over18":"1",
-    "redirect_to":"%2Fv%2FvO8Mn",
-    "remember_me_token":"cbJdeaFpbHMiOnsibWVzc2FnZSI6IklrNVJjbTAzZFVSRVlVaEtPWEpUVFhOVU0yNXhJZz09IiwiZXhwIjoiMjAyMS0wNS0xNVQxMzoyODoxNy4wMDBaIiwicHVyIjoiY29va2llLnJlbWVtYmVyX21lX3Rva2VuIn19--a7131611e844cf75f9db4cd411b635889bff3fe3",
-    "_jdb_session":"asddefqfwfwwrfdsdaAmqKj1%2FvOrDQP4b7h%2BvGp7brvIShi2Y%2FHBUr%2BklApk06TfhBOK3g5gRImZzoi49GINH%2FK49o3W%2FX64ugBiUAcudN9b27Mg6Ohu%2Bx9Z7A4bbqmqCt7XR%2Bao8PRuOjMcdDG5czoYHJCPIPZQFU28Gd7Awc2jc5FM5CoIgSRyaYDy9ulTO7DlavxoNL%2F6OFEL%2FyaA6XUYTB2Gs1kpPiUDqwi854mo5%2FrNxMhTeBK%2BjXciazMtN5KlE5JIOfiWAjNrnx7SV3Hj%2FqPNxRxXFQyEwHr5TZa0Vk1%2FjbwWQ0wcIFfh%2FMLwwqKydAh%2FLndc%2Bmdv3e%2FJ%2BiL2--xhqYnMyVRlxJajdN--u7nl0M7Oe7tZtPd4kIaEbg%3D%3D",
-    "locale":"zh",
-    "__cfduid":"dee27116d98c432a5cabc1fe0e7c2f3c91620479752",
-    "theme":"auto"
-}
-'''
-# 从网站登录后,通过浏览器插件(CookieBro或EdittThisCookie)或者直接在地址栏网站链接信息处都可以复制或者导出cookie内容,
-# 并填写到以上json文件的相应字段中
-def load_cookies(filename):
-    filename = os.path.basename(filename)
+def load_cookies(cookie_json_filename: str):
+    """
+    加载cookie,用于以会员方式访问非游客内容
+
+    :filename: cookie文件名。获取cookie方式:从网站登录后,通过浏览器插件(CookieBro或EdittThisCookie)或者直接在地址栏网站链接信息处都可以复制或者导出cookie内容,以JSON方式保存
+
+    # 示例: FC2-755670 url https://javdb9.com/v/vO8Mn
+    # json 文件格式
+    # 文件名: 站点名.json,示例 javdb9.json
+    # 内容(文件编码:UTF-8):
+    {
+        "over18":"1",
+        "redirect_to":"%2Fv%2FvO8Mn",
+        "remember_me_token":"***********",
+        "_jdb_session":"************",
+        "locale":"zh",
+        "__cfduid":"*********",
+        "theme":"auto"
+    }
+    """
+    filename = os.path.basename(cookie_json_filename)
     if not len(filename):
         return None, None
     path_search_order = (
@@ -364,8 +385,11 @@ def load_cookies(filename):
     except:
         return None, None
 
-# 文件修改时间距此时的天数
-def file_modification_days(filename) -> int:
+
+def file_modification_days(filename: str) -> int:
+    """
+    文件修改时间距此时的天数
+    """
     mfile = Path(filename)
     if not mfile.is_file():
         return 9999
@@ -376,48 +400,61 @@ def file_modification_days(filename) -> int:
         return 9999
     return days
 
+
 def file_not_exist_or_empty(filepath) -> bool:
     return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
 
-# 日语简单检测
-def is_japanese(s) -> bool:
-    return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
+
+def is_japanese(raw: str) -> bool:
+    """
+    日语简单检测
+    """
+    return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
 
 
 # Usage: python ./ADC_function.py https://cn.bing.com/
 if __name__ == "__main__":
     import sys, timeit
     from http.client import HTTPConnection
+
+
     def benchmark(t, url):
         print(f"HTTP GET Benchmark times:{t} url:{url}")
         tm = timeit.timeit(f"_ = session1.get('{url}')",
-            "from __main__ import get_html_session;session1=get_html_session()",
-            number=t)
+                           "from __main__ import get_html_session;session1=get_html_session()",
+                           number=t)
         print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
         tm = timeit.timeit(f"_ = scraper1.get('{url}')",
-            "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
-            number=t)
+                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
+                           number=t)
         print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
         tm = timeit.timeit(f"_ = browser1.open('{url}')",
-            "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
-            number=t)
+                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
+                           number=t)
         print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
         tm = timeit.timeit(f"_ = get_html('{url}')",
-            "from __main__ import get_html",
-            number=t)
+                           "from __main__ import get_html",
+                           number=t)
         print(f' *{tm:>10.5f}s get_html()')
+
+
     t = 100
-    #url = "https://www.189.cn/"
+
+    # url = "https://www.189.cn/"
     url = "http://www.chinaunicom.com"
     HTTPConnection.debuglevel = 1
     s = get_html_session()
     _ = s.get(url)
     HTTPConnection.debuglevel = 0
-    if len(sys.argv)>1:
+    if len(sys.argv) > 1:
         url = sys.argv[1]
     benchmark(t, url)
 
-def download_file_with_filename(url, filename, path):
+
+def download_file_with_filename(url: str, filename: str, path: str) -> None:
+    """
+    download file and save it to the given path with the given name from the given url
+    """
     conf = config.getInstance()
     configProxy = conf.proxy()
 
@@ -475,40 +512,66 @@ def download_file_with_filename(url, filename, path):
         raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
     return
 
-def download_one_file(args):
+
+def download_one_file(args) -> str:
+    """
+    download a file from the given url and save it to the given path
+    wrapped for use with map()
+    """
+
     def _inner(url: str, save_path: Path):
         filebytes = get_html(url, return_type='content')
         if isinstance(filebytes, bytes) and len(filebytes):
             if len(filebytes) == save_path.open('wb').write(filebytes):
                 return str(save_path)
+
     return _inner(*args)
 
 
-'''用法示例: 2线程同时下载两个不同文件,并保存到不同路径,路径目录可未创建,但需要具备对目标目录和文件的写权限
-parallel_download_files([
+def parallel_download_files(dn_list: typing.Iterable[typing.Sequence], parallel: int = 0):
+    """
+    download files in parallel 多线程下载文件
+
+    用法示例: 2线程同时下载两个不同文件,并保存到不同路径,路径目录可未创建,但需要具备对目标目录和文件的写权限
+    parallel_download_files([
     ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
     ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
     ])
-'''
-# dn_list 可以是 tuple或者list: ((url1, save_fullpath1),(url2, save_fullpath2),)
-# parallel: 并行下载的线程池线程数,为0则由函数自己决定
-def parallel_download_files(dn_list, parallel: int = 0):
+
+    :dn_list: 可以是 tuple或者list: ((url1, save_fullpath1),(url2, save_fullpath2),) fullpath可以是str或Path
+    :parallel: 并行下载的线程池线程数,为0则由函数自己决定
+    """
     mp_args = []
     for url, fullpath in dn_list:
-        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+        if url and isinstance(url, str) and url.startswith('http') \
+                and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
             fullpath = Path(fullpath)
             fullpath.parent.mkdir(parents=True, exist_ok=True)
             mp_args.append((url, fullpath))
     if not len(mp_args):
         return []
-    if not isinstance(parallel, int) or parallel not in range(1,200):
+    if not isinstance(parallel, int) or parallel not in range(1, 200):
        parallel = min(5, len(mp_args))
     with ThreadPoolExecutor(parallel) as pool:
         results = list(pool.map(download_one_file, mp_args))
     return results
 
-def delete_all_elements_in_list(string,lists):
+
+def delete_all_elements_in_list(string: str, lists: typing.Iterable[str]):
+    """
+    delete all elements equal to the given string from the given list
+    """
     new_lists = []
     for i in lists:
         if i != string:
             new_lists.append(i)
     return new_lists
+
+
+def delete_all_elements_in_str(string_delete: str, string: str):
+    """
+    delete all occurrences of the given character from the given string
+    """
+    for i in string:
+        if i == string_delete:
+            string = string.replace(i, "")
+    return string
\ No newline at end of file
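For reference, the two download helpers added above compose as follows; a minimal usage sketch with placeholder URLs and paths (not taken from the patch), assuming write permission on the target directories:

    from ADC_function import parallel_download_files

    results = parallel_download_files([
        ('https://example.com/img/p1.jpg', '/tmp/mdc/img/p1.jpg'),
        ('https://example.com/cover/n1.xml', '/tmp/mdc/cover/n1.xml'),
    ], parallel=2)
    # each entry is the saved file path on success, or None when that download failed
    print(results)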
diff --git a/ImageProcessing/__init__.py b/ImageProcessing/__init__.py
new file mode 100644
index 0000000..f545e91
--- /dev/null
+++ b/ImageProcessing/__init__.py
@@ -0,0 +1,95 @@
+import logging
+import os
+import config
+import importlib
+from PIL import Image
+import shutil
+
+
+def face_crop_width(filename, width, height):
+    # 新宽度是高度的2/3
+    cropWidthHalf = int(height/3)
+    try:
+        locations_model = config.getInstance().face_locations_model().lower().split(',')
+        locations_model = filter(lambda x: x, locations_model)
+        for model in locations_model:
+            center, top = face_center(filename, model)
+            # 如果找到就跳出循环
+            if center:
+                cropLeft = center-cropWidthHalf
+                cropRight = center+cropWidthHalf
+                # 越界处理
+                if cropLeft < 0:
+                    cropLeft = 0
+                    cropRight = cropWidthHalf*2
+                elif cropRight > width:
+                    cropLeft = width-cropWidthHalf*2
+                    cropRight = width
+                return (cropLeft, 0, cropRight, height)
+    except:
+        print('[-]Not found face! ' + filename)
+    # 默认靠右切
+    return (width-cropWidthHalf*2, 0, width, height)
+
+
+def face_crop_height(filename, width, height):
+    cropHeight = int(width*3/2)
+    try:
+        locations_model = config.getInstance().face_locations_model().lower().split(',')
+        locations_model = filter(lambda x: x, locations_model)
+        for model in locations_model:
+            center, top = face_center(filename, model)
+            # 如果找到就跳出循环
+            if top:
+                # 头部靠上
+                cropTop = top
+                cropBottom = cropHeight + top
+                if cropBottom > height:
+                    cropTop = 0
+                    cropBottom = cropHeight
+                return (0, cropTop, width, cropBottom)
+    except:
+        print('[-]Not found face! ' + filename)
+    # 默认从顶部向下切割
+    return (0, 0, width, cropHeight)
+
+
+def cutImage(imagecut, path, fanart_path, poster_path):
+    fullpath_fanart = os.path.join(path, fanart_path)
+    fullpath_poster = os.path.join(path, poster_path)
+    if imagecut == 1:  # 剪裁大封面
+        try:
+            img = Image.open(fullpath_fanart)
+            width, height = img.size
+            if width/height > 2/3:  # 如果宽度大于2
+                # 以人像为中心切取
+                img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
+            elif width/height < 2/3:  # 如果高度大于3
+                # 从底部向上切割
+                img2 = img.crop(face_crop_height(fullpath_fanart, width, height))
+            else:  # 如果等于2/3
+                img2 = img
+            img2.save(fullpath_poster)
+            print('[+]Image Cut! ' + fullpath_poster)
+        except Exception as e:
+            print(e)
+            print('[-]Cover cut failed!')
+    elif imagecut == 0:  # 复制封面
+        shutil.copyfile(fullpath_fanart, fullpath_poster)
+        print('[+]Image Copied! ' + fullpath_poster)
+
+
+def face_center(filename, model):
+    print('[+]Use model ' + model)
+    try:
+        mod = importlib.import_module('.' + model, 'ImageProcessing')
+        return mod.face_center(filename, model)
+    except Exception as e:
+        print('[-]Model failed to find face ' + filename)
+        if config.getInstance().debug() == 1:
+            logging.error(e)
+        return (0, 0)
+
+
+if __name__ == '__main__':
+    cutImage(1, 'H:\\test\\', '12.jpg', 'test.jpg')
\ No newline at end of file
diff --git a/ImageProcessing/baidu.py b/ImageProcessing/baidu.py
new file mode 100644
index 0000000..156bb8a
--- /dev/null
+++ b/ImageProcessing/baidu.py
@@ -0,0 +1,25 @@
+from aip import AipBodyAnalysis
+import config
+
+
+def face_center(filename, model):
+    app_id = config.getInstance().conf.get("face", "appid")
+    api_key = config.getInstance().conf.get("face", "key")
+    app_secret = config.getInstance().conf.get("face", "secret")
+    client = AipBodyAnalysis(app_id, api_key, app_secret)
+    with open(filename, 'rb') as fp:
+        img = fp.read()
+    result = client.bodyAnalysis(img)
+    if 'error_code' in result:
+        raise ValueError(result['error_msg'])
+    print('[+]Found person ' + str(result['person_num']))
+    # 中心点取鼻子x坐标
+    maxRight = 0
+    maxTop = 0
+    for person_info in result["person_info"]:
+        x = int(person_info['body_parts']['nose']['x'])
+        top = int(person_info['location']['top'])
+        if x > maxRight:
+            maxRight = x
+            maxTop = top
+    return maxRight, maxTop
diff --git a/ImageProcessing/cnn.py b/ImageProcessing/cnn.py
new file mode 100644
index 0000000..4219c5d
--- /dev/null
+++ b/ImageProcessing/cnn.py
@@ -0,0 +1,4 @@
+from . import hog
+
+def face_center(filename, model):
+    return hog.face_center(filename, model)
\ No newline at end of file
diff --git a/ImageProcessing/hog.py b/ImageProcessing/hog.py
new file mode 100644
index 0000000..4e672b5
--- /dev/null
+++ b/ImageProcessing/hog.py
@@ -0,0 +1,17 @@
+import face_recognition
+
+
+def face_center(filename, model):
+    image = face_recognition.load_image_file(filename)
+    face_locations = face_recognition.face_locations(image, 1, model)
+    print('[+]Found person ' + str(len(face_locations)))
+    maxRight = 0
+    maxTop = 0
+    for face_location in face_locations:
+        top, right, bottom, left = face_location
+        # 中心点
+        x = int((right+left)/2)
+        if x > maxRight:
+            maxRight = x
+            maxTop = top
+    return maxRight, maxTop
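Taken together, the new package crops posters to a 2:3 aspect ratio: face_crop_width narrows a wide fanart to two thirds of its height around the detected face, while face_crop_height cuts a tall image down from the face's top edge. A minimal sketch of how caller code could drive it (the paths are hypothetical, not from this patch):

    from ImageProcessing import cutImage

    # imagecut == 1: crop the fanart around the detected face and save the poster
    # imagecut == 0: just copy fanart.jpg to poster.jpg
    cutImage(1, '/tmp/ABC-123', 'fanart.jpg', 'poster.jpg')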
diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py
index a4f91b8..141a241 100644
--- a/Movie_Data_Capture.py
+++ b/Movie_Data_Capture.py
@@ -3,19 +3,20 @@ import json
 import os
 import re
 import sys
+import time
 import shutil
 import typing
 import urllib3
 import signal
 import platform
+import multiprocessing
+from datetime import datetime, timedelta
+from pathlib import Path
+
 from opencc import OpenCC
-import ADC_function
 import config
-from datetime import datetime, timedelta
-import time
-from pathlib import Path
-from ADC_function import file_modification_days, get_html, parallel_download_files
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
 
@@ -30,7 +31,7 @@ def check_update(local_version):
         time.sleep(60)
         os._exit(-1)
     data = json.loads(htmlcode)
-    remote = int(data["tag_name"].replace(".",""))
+    remote = int(data["tag_name"].replace(".", ""))
     local_version = int(local_version.replace(".", ""))
     if local_version < remote:
         print("[*]" + ("* New update " + str(data["tag_name"]) + " *").center(54))
@@ -43,36 +44,44 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
     conf = config.getInstance()
     parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
     parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
-    parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.")
-    parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
+    parser.add_argument("-p", "--path", default='', nargs='?', help="Analysis folder path.")
+    parser.add_argument("-m", "--main-mode", default='', nargs='?',
+                        help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
     parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
     # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
     default_logdir = str(Path.home() / '.mlogs')
-    parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?',
-        help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
+    parser.add_argument("-o", "--log-dir", dest='logdir', default=default_logdir, nargs='?',
+                        help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
        default folder for current user: '{default_logdir}'. Change default folder to an empty file,
        or use --log-dir= to turn log off.""")
-    parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
-    parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
-    parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
+    parser.add_argument("-q", "--regex-query", dest='regexstr', default='', nargs='?',
+                        help="python re module regex filepath filtering.")
+    parser.add_argument("-d", "--nfo-skip-days", dest='days', default='', nargs='?',
+                        help="Override nfo_skip_days value in config.")
+    parser.add_argument("-c", "--stop-counter", dest='cnt', default='', nargs='?',
+                        help="Override stop_counter value in config.")
     parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
-        os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
+                            os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
     parser.add_argument("-a", "--auto-exit", action="store_true",
                         help="Auto exit after program complete")
-    parser.add_argument("-g","--debug", action="store_true",
+    parser.add_argument("-g", "--debug", action="store_true",
                         help="Turn on debug mode to generate diagnostic log for issue report.")
-    parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true",
+    parser.add_argument("-z", "--zero-operation", dest='zero_op', action="store_true",
                         help="""Only show job list of files and numbers, and **NO** actual operation
is performed. It may help you correct wrong numbers before real job.""")
     parser.add_argument("-v", "--version", action="version", version=ver)
     args = parser.parse_args()
 
+
     def get_natural_number_or_none(value):
-        return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None
+        return int(value) if isinstance(value, str) and value.isnumeric() and int(value) >= 0 else None
+
     def get_str_or_none(value):
         return value if isinstance(value, str) and len(value) else None
+
     def get_bool_or_none(value):
         return True if isinstance(value, bool) and value else None
+
     config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
     config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
     config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
@@ -83,43 +92,53 @@ is performed. It may help you correct wrong numbers before real job.""")
     return args.file, args.number, args.logdir, args.regexstr, args.zero_op
 
 
 class OutLogger(object):
     def __init__(self, logfile) -> None:
         self.term = sys.stdout
-        self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.log = open(logfile, "w", encoding='utf-8', buffering=1)
         self.filepath = logfile
+
     def __del__(self):
         self.close()
+
     def __enter__(self):
         pass
+
     def __exit__(self, *args):
         self.close()
-    def write(self,msg):
+
+    def write(self, msg):
         self.term.write(msg)
         self.log.write(msg)
+
     def flush(self):
         self.term.flush()
         self.log.flush()
         os.fsync(self.log.fileno())
+
     def close(self):
-        if self.term != None:
+        if self.term is not None:
             sys.stdout = self.term
             self.term = None
-        if self.log != None:
+        if self.log is not None:
             self.log.close()
             self.log = None
 
 
 class ErrLogger(OutLogger):
+
     def __init__(self, logfile) -> None:
         self.term = sys.stderr
-        self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.log = open(logfile, "w", encoding='utf-8', buffering=1)
         self.filepath = logfile
+
     def close(self):
-        if self.term != None:
+        if self.term is not None:
             sys.stderr = self.term
             self.term = None
-        if self.log != None:
+
+        if self.log is not None:
             self.log.close()
             self.log = None
 
@@ -130,7 +149,7 @@ def dupe_stdout_to_logfile(logdir: str):
     log_dir = Path(logdir)
     if not log_dir.exists():
         try:
-            log_dir.mkdir(parents=True,exist_ok=True)
+            log_dir.mkdir(parents=True, exist_ok=True)
         except:
             pass
     if not log_dir.is_dir():
@@ -147,7 +166,7 @@ def dupe_stdout_to_logfile(logdir: str):
 def close_logfile(logdir: str):
     if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir):
         return
-    #日志关闭前保存日志路径
+    # 日志关闭前保存日志路径
     filepath = None
     try:
         filepath = sys.stdout.filepath
@@ -158,7 +177,7 @@ def close_logfile(logdir: str):
     log_dir = Path(logdir).resolve()
     if isinstance(filepath, Path):
         print(f"Log file '{filepath}' saved.")
-        assert(filepath.parent.samefile(log_dir))
+        assert (filepath.parent.samefile(log_dir))
     # 清理空文件
     for f in log_dir.glob(r'*_err.txt'):
         if f.stat().st_size == 0:
@@ -198,7 +217,7 @@ def close_logfile(logdir: str):
     cutday = len('T235959.txt')  # cut length mdc_20201201|T235959.txt
     for f in day_merge:
         try:
-            day_file_name = str(f)[:-cutday] + '.txt' # mdc_20201201.txt
+            day_file_name = str(f)[:-cutday] + '.txt'  # mdc_20201201.txt
             with open(day_file_name, 'a', encoding='utf-8') as m:
                 m.write(f.read_text(encoding='utf-8'))
             f.unlink(missing_ok=True)
@@ -210,7 +229,7 @@ def close_logfile(logdir: str):
         if not txts or not len(txts):
             break
         txts.sort()
-        tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32")
+        tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3 * 30)).strftime("%Y%m32")
         deadline_month = f'mdc_{tmstr_3_month_ago}'
         month_merge = [f for f in txts if f.stem < deadline_month]
         if not month_merge or not len(month_merge):
@@ -218,7 +237,7 @@ def close_logfile(logdir: str):
         tomonth = len('01.txt')  # cut length mdc_202012|01.txt
         for f in month_merge:
             try:
-                month_file_name = str(f)[:-tomonth] + '.txt' # mdc_202012.txt
+                month_file_name = str(f)[:-tomonth] + '.txt'  # mdc_202012.txt
                 with open(month_file_name, 'a', encoding='utf-8') as m:
                     m.write(f.read_text(encoding='utf-8'))
                 f.unlink(missing_ok=True)
@@ -231,14 +250,14 @@ def close_logfile(logdir: str):
     if not mons or not len(mons):
         return
     mons.sort()
-    deadline_year = f'mdc_{today.year-1}13'
+    deadline_year = f'mdc_{today.year - 1}13'
     year_merge = [f for f in mons if f.stem < deadline_year]
     if not year_merge or not len(year_merge):
         return
-    toyear = len('12.txt') # cut length mdc_2020|12.txt
+    toyear = len('12.txt')  # cut length mdc_2020|12.txt
     for f in year_merge:
         try:
-            year_file_name = str(f)[:-toyear] + '.txt' # mdc_2020.txt
+            year_file_name = str(f)[:-toyear] + '.txt'  # mdc_2020.txt
             with open(year_file_name, 'a', encoding='utf-8') as y:
                 y.write(f.read_text(encoding='utf-8'))
             f.unlink(missing_ok=True)
@@ -254,13 +273,14 @@ def signal_handler(*args):
     print('[!]Ctrl+C detected, Exit.')
     sys.exit(9)
 
+
 def sigdebug_handler(*args):
     config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
     print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF'))
 
 
 # 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告
-def movie_lists(source_folder, regexstr):
+def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     conf = config.getInstance()
     main_mode = conf.main_mode()
     debug = conf.debug()
@@ -280,9 +300,9 @@ def movie_lists(source_folder, regexstr):
     try:
         flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
         failed_set = set(flist)
-        if len(flist) != len(failed_set): # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的
+        if len(flist) != len(failed_set):  # 检查去重并写回,但是不改变failed_list.txt内条目的先后次序,重复的只保留最后的
             fset = failed_set.copy()
-            for i in range(len(flist)-1, -1, -1):
+            for i in range(len(flist) - 1, -1, -1):
                 fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
             failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8')
             assert len(fset) == 0 and len(flist) == len(failed_set)
@@ -308,14 +328,15 @@ def movie_lists(source_folder, regexstr):
             continue
         is_sym = full_name.is_symlink()
         if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1):  # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标
-            continue # file is symlink or hardlink(Linux/NTFS/Darwin)
+            continue  # file is symlink or hardlink(Linux/NTFS/Darwin)
         # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB)
         movie_size = 0 if is_sym else full_name.stat().st_size  # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测
-        if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
+        if 0 < movie_size < 125829120:  # 1024*1024*120=125829120
             continue
         if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
             continue
-        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(
+                full_name.with_suffix('.nfo')) <= nfo_skip_days:
             skip_nfo_days_cnt += 1
             if debug:
                 print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
@@ -325,7 +346,8 @@ def movie_lists(source_folder, regexstr):
     if skip_failed_cnt:
         print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
     if skip_nfo_days_cnt:
-        print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
+        print(
+            f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
     if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
         return total
     # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的
@@ -351,13 +373,17 @@ def movie_lists(source_folder, regexstr):
             if debug:
                 print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
     if len(rm_list):
-        print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.")
+        print(
+            f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.")
 
     return total
 
 
-def create_failed_folder(failed_folder):
-    if not os.path.exists(failed_folder):  # 新建failed文件夹
+def create_failed_folder(failed_folder: str):
+    """
+    新建failed文件夹
+    """
+    if not os.path.exists(failed_folder):
         try:
             os.makedirs(failed_folder)
         except:
@@ -370,9 +396,7 @@ def rm_empty_folder(path):
     deleted = set()
     for current_dir, subdirs, files in os.walk(abspath, topdown=False):
         try:
-            still_has_subdirs = any(
-                _ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted
-            )
+            still_has_subdirs = any(_ for subdir in subdirs if os.path.join(current_dir, subdir) not in deleted)
             if not any(files) and not still_has_subdirs and not os.path.samefile(path, current_dir):
                 os.rmdir(current_dir)
                 deleted.add(current_dir)
@@ -387,7 +411,7 @@ def create_data_and_move(file_path: str, zero_op, oCC):
     n_number = get_number(debug, os.path.basename(file_path))
     file_path = os.path.abspath(file_path)
 
-    if debug == True:
+    if debug is True:
         print(f"[!] [{n_number}] As Number making data for '{file_path}'")
         if zero_op:
             return
@@ -443,8 +467,8 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
 
 
 def main():
-    version = '6.0.1'
-    urllib3.disable_warnings() #Ignore http proxy warning
+    version = '6.0.2'
+    urllib3.disable_warnings()  # Ignore http proxy warning
 
     # Read config.ini first, in argparse_function() need conf.failed_folder()
     conf = config.Config("config.ini")
@@ -455,7 +479,8 @@ def main():
 
     main_mode = conf.main_mode()
 
-    if not main_mode in (1, 2, 3):
+    folder_path = ""
+    if main_mode not in (1, 2, 3):
         print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
         sys.exit(4)
 
@@ -466,7 +491,8 @@ def main():
     signal.signal(signal.SIGWINCH, sigdebug_handler)
     dupe_stdout_to_logfile(logdir)
 
-    platform_total = str(' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version())
+    platform_total = str(
+        ' - ' + platform.platform() + ' \n[*] - ' + platform.machine() + ' - Python-' + platform.python_version())
 
     print('[*]================= Movie Data Capture =================')
     print('[*]' + version.center(54))
@@ -484,15 +510,15 @@ def main():
         print('[+]Enable debug')
     if conf.soft_link():
         print('[!]Enable soft link')
-    if len(sys.argv)>1:
-        print('[!]CmdLine:'," ".join(sys.argv[1:]))
+    if len(sys.argv) > 1:
+        print('[!]CmdLine:', " ".join(sys.argv[1:]))
     print('[+]Main Working mode ## {}: {} ## {}{}{}'
-        .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1],
-            "" if not conf.multi_threading() else ", multi_threading on",
-            "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
-            "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
-            ) if not single_file_path else ('-','Single File', '','',''))
-    )
+          .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode - 1],
+                    "" if not conf.multi_threading() else ", multi_threading on",
+                    "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
+                    "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
+                    ) if not single_file_path else ('-', 'Single File', '', '', ''))
+          )
 
     if conf.update_check():
         check_update(version)
@@ -500,11 +526,15 @@ def main():
     create_failed_folder(conf.failed_folder())
 
     # Download Mapping Table, parallel version
-    def fmd(f):
+    def fmd(f) -> typing.Tuple[str, Path]:
+        """
+        return (download url, local save path) for a mapping table file
+        """
         return ('https://raw.githubusercontent.com/yoshiko2/Movie_Data_Capture/master/MappingTable/' + f,
                 Path.home() / '.local' / 'share' / 'mdc' / f)
+
     map_tab = (fmd('mapping_actor.xml'), fmd('mapping_info.xml'), fmd('c_number.json'))
-    for k,v in map_tab:
+    for k, v in map_tab:
         if v.exists():
             if file_modification_days(str(v)) >= conf.mapping_table_validity():
                 print("[+]Mapping Table Out of date! Remove", str(v))
@@ -524,14 +554,15 @@ def main():
     try:
         oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json')
     except:
-        # some OS no OpennCC cpython, try opencc-python-reimplemented.
+        # some OS no OpenCC cpython, try opencc-python-reimplemented.
         # pip uninstall opencc && pip install opencc-python-reimplemented
         oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t')
 
-    if not single_file_path == '': #Single File
+    if not single_file_path == '':  # Single File
         print('[+]==================== Single File =====================')
         if custom_number == '':
-            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
+            create_data_and_move_with_custom_number(single_file_path,
+                                                    get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
         else:
             create_data_and_move_with_custom_number(single_file_path, custom_number, oCC)
     else:
@@ -546,7 +577,7 @@ def main():
         print('[+]Find', count_all, 'movies.')
         print('[*]======================================================')
         stop_count = conf.stop_counter()
-        if stop_count<1:
+        if stop_count < 1:
             stop_count = 999999
         else:
             count_all = str(min(len(movie_list), stop_count))
@@ -554,7 +585,8 @@ def main():
         for movie_path in movie_list:  # 遍历电影列表 交给core处理
             count = count + 1
             percentage = str(count / int(count_all) * 100)[:4] + '%'
-            print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
+            print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -',
+                                            time.strftime("%H:%M:%S")))
             create_data_and_move(movie_path, zero_op, oCC)
             if count >= stop_count:
                 print("[!]Stop counter triggered!")
@@ -569,7 +601,7 @@ def main():
     end_time = time.time()
     total_time = str(timedelta(seconds=end_time - start_time))
     print("[+]Running time", total_time[:len(total_time) if total_time.rfind('.') < 0 else -3],
-        " End at", time.strftime("%Y-%m-%d %H:%M:%S"))
+          " End at", time.strftime("%Y-%m-%d %H:%M:%S"))
 
     print("[+]All finished!!!")
 
@@ -580,7 +612,7 @@ def main():
     sys.exit(0)
 
 
-import multiprocessing
+
+
 if __name__ == '__main__':
     multiprocessing.freeze_support()
     main()
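The mapping tables above are only re-downloaded once the local copy is older than conf.mapping_table_validity() days; a condensed sketch of that refresh decision (the 7-day validity is a placeholder, the real value comes from config.ini):

    from pathlib import Path
    from ADC_function import file_modification_days

    validity_days = 7  # stand-in for conf.mapping_table_validity()
    local = Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'
    if local.exists() and file_modification_days(str(local)) >= validity_days:
        local.unlink()  # outdated copy is removed, then queued for parallel re-download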
diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index c661700..d467eab 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -22,6 +22,8 @@ from . import xcity
 from . import dlsite
 from . import carib
 from . import fc2club
+from . import mv91
+from . import madou
 
 
 def get_data_state(data: dict) -> bool:  # 元数据获取失败检测
@@ -36,9 +38,10 @@ def get_data_state(data: dict) -> bool:  # 元数据获取失败检测
     return True
 
 
-def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
+
+def get_data_from_json(file_number, oCC):
     """
-    iterate through all services and fetch the data
+    iterate through all services and fetch the data 从JSON返回元数据
     """
 
     actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
@@ -57,13 +60,15 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
         # "javlib": javlib.main,
         "dlsite": dlsite.main,
         "carib": carib.main,
-        "fc2club": fc2club.main
+        "fc2club": fc2club.main,
+        "mv91": mv91.main,
+        "madou": madou.main
     }
 
     conf = config.getInstance()
     # default fetch order list, from the beginning to the end
     sources = conf.sources().split(',')
-    if not len(conf.sources()) > 80:
+    if len(sources) <= len(func_mapping):
         # if the input file name matches certain rules,
         # move some web service to the beginning of the list
         lo_file_number = file_number.lower()
@@ -231,8 +236,8 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
     json_data['studio'] = studio
     json_data['director'] = director
 
-    if conf.is_transalte():
-        translate_values = conf.transalte_values().split(",")
+    if conf.is_translate():
+        translate_values = conf.translate_values().split(",")
         for translate_value in translate_values:
             if json_data[translate_value] == "":
                 continue
@@ -244,12 +249,12 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
                     continue
             except:
                 pass
-            if conf.get_transalte_engine() == "azure":
+            if conf.get_translate_engine() == "azure":
                 t = translate(
                     json_data[translate_value],
                     target_language="zh-Hans",
-                    engine=conf.get_transalte_engine(),
-                    key=conf.get_transalte_key(),
+                    engine=conf.get_translate_engine(),
+                    key=conf.get_translate_key(),
                 )
             else:
                 t = translate(json_data[translate_value])
@@ -270,7 +275,7 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
             if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
                 return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
             else:
-                return vars
+                raise IndexError('keyword not found')
         for cc in cc_vars:
             if json_data[cc] == "" or len(json_data[cc]) == 0:
                 continue
@@ -298,20 +303,20 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
                         json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
                     elif ccm == 3:
                         json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
-                        json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                        json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
                 except:
                     json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
             else:
                 try:
                     if ccm == 1:
                         json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
-                        json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                        json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
                     elif ccm == 2:
                         json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
-                        json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                        json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
                     elif ccm == 3:
                         json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
-                        json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                        json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
                 except IndexError:
                     json_data[cc] = oCC.convert(json_data[cc])
                 except:
@@ -322,11 +327,13 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
         if i not in json_data:
             naming_rule += i.strip("'").strip('"')
         else:
-            naming_rule += json_data.get(i)
+            item = json_data.get(i)
+            naming_rule += item if type(item) is not list else "&".join(item)
 
     json_data['naming_rule'] = naming_rule
     return json_data
 
+
 def special_characters_replacement(text) -> str:
     if not isinstance(text, str):
         return text
diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index 3368430..d0eae1d 100755
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -139,6 +139,7 @@ def getCover_small(html, index=0):
 
 
 def getTrailer(htmlcode):  # 获取预告片
     video_pather = re.compile(r'