diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6007c75..b071b16 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,6 +19,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install UPX + uses: crazy-max/ghaction-upx@v2 + if: matrix.os == 'windows-latest' || matrix.os == 'ubuntu-latest' + with: + install-only: true + + - name: UPX version + if: matrix.os == 'windows-latest' || matrix.os == 'ubuntu-latest' + run: upx --version + - name: Setup Python 3.10 uses: actions/setup-python@v2 with: @@ -28,6 +38,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt + pip install face_recognition --no-deps pip install pyinstaller - name: Test number_perser.get_number @@ -39,11 +50,11 @@ jobs: run: | pyinstaller \ --onefile Movie_Data_Capture.py \ - --python-option u \ - --hidden-import "ImageProcessing.cnn" \ - --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ - --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ - --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \ + --collect-submodules "scrapinglib" \ + --collect-submodules "ImageProcessing" \ + --collect-data "face_recognition_models" \ + --collect-data "cloudscraper" \ + --collect-data "opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." \ @@ -52,11 +63,11 @@ jobs: run: | pyinstaller ` --onefile Movie_Data_Capture.py ` - --python-option u ` - --hidden-import "ImageProcessing.cnn" ` - --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" ` - --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" ` - --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1);face_recognition_models" ` + --collect-submodules "scrapinglib" ` + --collect-submodules "ImageProcessing" ` + --collect-data "face_recognition_models" ` + --collect-data "cloudscraper" ` + --collect-data "opencc" ` --add-data "Img;Img" ` --add-data "config.ini;." ` @@ -77,5 +88,5 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v1 with: - name: Movie_Data_Capture-CLI-${{ env.VERSION }}-${{ runner.os }}-amd64 + name: MDC-${{ env.VERSION }}-${{ runner.os }}-amd64 path: dist diff --git a/ADC_function.py b/ADC_function.py index 4fcbec1..5b099ef 100644 --- a/ADC_function.py +++ b/ADC_function.py @@ -301,7 +301,7 @@ def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, r def translate( src: str, - target_language: str = "zh_cn", + target_language: str = config.getInstance().get_target_language(), engine: str = "google-free", app_id: str = "", key: str = "", @@ -342,7 +342,15 @@ def translate( result = post_html(url=url, query=body, headers=headers) translate_list = [i["text"] for i in result.json()[0]["translations"]] trans_result = trans_result.join(translate_list) - + elif engine == "deeplx": + url = config.getInstance().get_translate_service_site() + res = requests.post(f"{url}/translate", json={ + 'text': src, + 'source_lang': 'auto', + 'target_lang': target_language, + }) + if res.text.strip(): + trans_result = res.json().get('data') else: raise ValueError("Non-existent translation engine") diff --git a/ImageProcessing/__init__.py b/ImageProcessing/__init__.py index 4df1797..768b1b4 100644 --- a/ImageProcessing/__init__.py +++ b/ImageProcessing/__init__.py @@ -60,9 +60,9 @@ def face_crop_height(filename, width, height): return (0, 0, width, cropHeight) -def cutImage(imagecut, path, fanart_path, poster_path, skip_facerec=False): +def cutImage(imagecut, path, thumb_path, poster_path, skip_facerec=False): conf = config.getInstance() - fullpath_fanart = os.path.join(path, fanart_path) + fullpath_fanart = os.path.join(path, thumb_path) fullpath_poster = os.path.join(path, poster_path) aspect_ratio = conf.face_aspect_ratio() if conf.face_aways_imagecut(): diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py index 46c7695..91d8bdb 100644 --- a/Movie_Data_Capture.py +++ b/Movie_Data_Capture.py @@ -104,9 +104,9 @@ is performed. It may help you correct wrong numbers before real job.""") set_str_or_none("common:source_folder", args.path) set_bool_or_none("common:auto_exit", args.auto_exit) set_natural_number_or_none("common:nfo_skip_days", args.days) - set_natural_number_or_none("common:stop_counter", args.cnt) + set_natural_number_or_none("advenced_sleep:stop_counter", args.cnt) set_bool_or_none("common:ignore_failed_list", args.ignore_failed_list) - set_str_or_none("common:rerun_delay", args.delaytm) + set_str_or_none("advenced_sleep:rerun_delay", args.delaytm) set_str_or_none("priority:website", args.site) if isinstance(args.dnimg, bool) and args.dnimg: conf.set_override("common:download_only_missing_images=0") @@ -119,7 +119,7 @@ is performed. It may help you correct wrong numbers before real job.""") if conf.main_mode() == 3: no_net_op = args.no_network_operation if no_net_op: - conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1") + conf.set_override("advenced_sleep:stop_counter=0;advenced_sleep:rerun_delay=0s;face:aways_imagecut=1") return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op, args.specified_source, args.specified_url @@ -681,7 +681,7 @@ def period(delta, pattern): if __name__ == '__main__': - version = '6.4.1' + version = '6.6.2' urllib3.disable_warnings() # Ignore http proxy warning app_start = time.time() diff --git a/README.md b/README.md index d4a9c42..a646ed5 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,12 @@ # 申明 当你查阅、下载了本项目源代码或二进制程序,即代表你接受了以下条款 * 本项目和项目成果仅供技术,学术交流和Python3性能测试使用 +* 用户必须确保获取影片的途径在用户当地是合法的 +* 运行时和运行后所获取的元数据和封面图片等数据的版权,归版权持有人持有 * 本项目贡献者编写该项目旨在学习Python3 ,提高编程水平 * 本项目不提供任何影片下载的线索 +* 请勿提供运行时和运行后获取的数据提供给可能有非法目的的第三方,例如用于非法交易、侵犯未成年人的权利等 +* 用户仅能在自己的私人计算机或者测试环境中使用该工具,禁止将获取到的数据用于商业目的或其他目的,如销售、传播等 * 用户在使用本项目和项目成果前,请用户了解并遵守当地法律法规,如果本项目及项目成果使用过程中存在违反当地法律法规的行为,请勿使用该项目及项目成果 * 法律后果及使用后果由使用者承担 * [GPL LICENSE](https://github.com/yoshiko2/Movie_Data_Capture/blob/master/LICENSE) @@ -36,3 +40,6 @@ # 贡献者 [![](https://opencollective.com/movie_data_capture/contributors.svg?width=890)](https://github.com/yoshiko2/movie_data_Capture/graphs/contributors) + +# Star History +[![Star History Chart](https://api.star-history.com/svg?repos=yoshiko2/Movie_Data_Capture&type=Date)](https://star-history.com/#yoshiko2/Movie_Data_Capture&Date) diff --git a/README_ZH.md b/README_ZH.md index 269e833..686945d 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -32,3 +32,7 @@ # 贡献者 [![](https://opencollective.com/movie_data_capture/contributors.svg?width=890)](https://github.com/yoshiko2/movie_data_Capture/graphs/contributors) + +# Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=yoshiko2/Movie_Data_Capture&type=Date)](https://star-history.com/#yoshiko2/Movie_Data_Capture&Date) diff --git a/config.ini b/config.ini index bd6c522..18ffc87 100755 --- a/config.ini +++ b/config.ini @@ -21,11 +21,14 @@ nfo_skip_days = 30 ignore_failed_list = 0 download_only_missing_images = 1 mapping_table_validity = 7 -; 在jellyfin中tags和genres重复,因此可以只保存genres到nfo中 -donot_save_tags = 0 +; 一些jellyfin中特有的设置 (0:不开启, 1:开启) 比如 +; 在jellyfin中tags和genres重复,因此可以只需保存genres到nfo中 +; jellyfin中只需要保存thumb,不需要保存fanart +jellyfin = 0 ; 开启后tag和genere只显示演员 actor_only_tag = 0 sleep = 3 +anonymous_fill = 1 [advenced_sleep] ; 处理完多少个视频文件后停止,0为处理所有视频文件 @@ -48,13 +51,14 @@ cacert_file = location_rule = actor+'/'+number naming_rule = number+'-'+title max_title_len = 50 -image_naming_with_number = 1 +; 刮削后图片是否命名为番号 +image_naming_with_number = 0 [update] update_check = 1 [priority] -website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle +website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,getchu,javdb,gcolle,javday,javmenu [escape] literals = \()/ @@ -66,13 +70,15 @@ switch = 0 ; 机器翻译 [translate] switch = 0 -;可选项 google-free,azure +;可选项 google-free,azure,deeplx engine = google-free -; azure翻译密钥 +target_language = zh_cn +; azure翻译密钥key key = ; 翻译延迟 delay = 1 values = title,outline +; google翻译服务站点,或deeplx访问链接 service_site = translate.google.cn ; 预告片 diff --git a/config.py b/config.py index 5232713..897df66 100644 --- a/config.py +++ b/config.py @@ -169,13 +169,13 @@ class Config: self._exit("common:main_mode") def source_folder(self) -> str: - return self.conf.get("common", "source_folder") + return self.conf.get("common", "source_folder").replace("\\\\", "/").replace("\\", "/") def failed_folder(self) -> str: - return self.conf.get("common", "failed_output_folder") + return self.conf.get("common", "failed_output_folder").replace("\\\\", "/").replace("\\", "/") def success_folder(self) -> str: - return self.conf.get("common", "success_output_folder") + return self.conf.get("common", "success_output_folder").replace("\\\\", "/").replace("\\", "/") def actor_gender(self) -> str: return self.conf.get("common", "actor_gender") @@ -213,8 +213,8 @@ class Config: def mapping_table_validity(self) -> int: return self.conf.getint("common", "mapping_table_validity") - def donot_save_tags(self) -> int: - return self.conf.getint("common", "donot_save_tags") + def jellyfin(self) -> int: + return self.conf.getint("common", "jellyfin") def actor_only_tag(self) -> bool: return self.conf.getboolean("common", "actor_only_tag") @@ -222,13 +222,16 @@ class Config: def sleep(self) -> int: return self.conf.getint("common", "sleep") + def anonymous_fill(self) -> bool: + return self.conf.getint("common", "anonymous_fill") + def stop_counter(self) -> int: return self.conf.getint("advenced_sleep", "stop_counter", fallback=0) def rerun_delay(self) -> int: value = self.conf.get("advenced_sleep", "rerun_delay") if not (isinstance(value, str) and re.match(r'^[\dsmh]+$', value, re.I)): - return 0 # not match '1h30m45s' or '30' or '1s2m1h4s5m' + return 0 # not match '1h30m45s' or '30' or '1s2m1h4s5m' if value.isnumeric() and int(value) >= 0: return int(value) sec = 0 @@ -279,6 +282,9 @@ class Config: def get_translate_engine(self) -> str: return self.conf.get("translate", "engine") + def get_target_language(self) -> str: + return self.conf.get("translate", "target_language") + # def get_translate_appId(self) ->str: # return self.conf.get("translate","appid") @@ -439,16 +445,19 @@ class Config: # actor_gender value: female or male or both or all(含人妖) conf.set(sec1, "actor_gender", "female") conf.set(sec1, "del_empty_folder", "1") - conf.set(sec1, "nfo_skip_days", 30) - conf.set(sec1, "ignore_failed_list", 0) - conf.set(sec1, "download_only_missing_images", 1) - conf.set(sec1, "mapping_table_validity", 7) - conf.set(sec1, "donot_save_tags", 0) + conf.set(sec1, "nfo_skip_days", "30") + conf.set(sec1, "ignore_failed_list", "0") + conf.set(sec1, "download_only_missing_images", "1") + conf.set(sec1, "mapping_table_validity", "7") + conf.set(sec1, "jellyfin", "0") + conf.set(sec1, "actor_only_tag", "0") + conf.set(sec1, "sleep", "3") + conf.set(sec1, "anonymous_fill", "0") sec2 = "advenced_sleep" conf.add_section(sec2) - conf.set(sec2, "stop_counter", 0) - conf.set(sec2, "rerun_delay", 0) + conf.set(sec2, "stop_counter", "0") + conf.set(sec2, "rerun_delay", "0") sec3 = "proxy" conf.add_section(sec3) @@ -463,6 +472,7 @@ class Config: conf.set(sec4, "location_rule", "actor + '/' + number") conf.set(sec4, "naming_rule", "number + '-' + title") conf.set(sec4, "max_title_len", "50") + conf.set(sec4, "image_naming_with_number", "0") sec5 = "update" conf.add_section(sec5) @@ -485,6 +495,7 @@ class Config: conf.add_section(sec9) conf.set(sec9, "switch", "0") conf.set(sec9, "engine", "google-free") + conf.set(sec9, "target_language", "zh_cn") # conf.set(sec8, "appid", "") conf.set(sec9, "key", "") conf.set(sec9, "delay", "1") @@ -508,28 +519,28 @@ class Config: sec13 = "watermark" conf.add_section(sec13) - conf.set(sec13, "switch", 1) - conf.set(sec13, "water", 2) + conf.set(sec13, "switch", "1") + conf.set(sec13, "water", "2") sec14 = "extrafanart" conf.add_section(sec14) - conf.set(sec14, "switch", 1) + conf.set(sec14, "switch", "1") conf.set(sec14, "extrafanart_folder", "extrafanart") - conf.set(sec14, "parallel_download", 1) + conf.set(sec14, "parallel_download", "1") sec15 = "storyline" conf.add_section(sec15) - conf.set(sec15, "switch", 1) + conf.set(sec15, "switch", "1") conf.set(sec15, "site", "1:avno1,4:airavwiki") conf.set(sec15, "censored_site", "2:airav,5:xcity,6:amazon") conf.set(sec15, "uncensored_site", "3:58avgo") - conf.set(sec15, "show_result", 0) - conf.set(sec15, "run_mode", 1) - conf.set(sec15, "cc_convert", 1) + conf.set(sec15, "show_result", "0") + conf.set(sec15, "run_mode", "1") + conf.set(sec15, "cc_convert", "1") sec16 = "cc_convert" conf.add_section(sec16) - conf.set(sec16, "mode", 1) + conf.set(sec16, "mode", "1") conf.set(sec16, "vars", "actor,director,label,outline,series,studio,tag,title") sec17 = "javdb" diff --git a/core.py b/core.py index 9f581d3..dba98da 100644 --- a/core.py +++ b/core.py @@ -272,22 +272,25 @@ def extrafanart_download_threadpool(url_list, save_dir, number, json_data=None): def image_ext(url): try: - return os.path.splitext(url)[-1] + ext = os.path.splitext(url)[-1] + if ext in {'.jpg','.jpge','.bmp','.png','.gif'}: + return ext + return ".jpg" except: return ".jpg" # 封面是否下载成功,否则移动到failed def image_download(cover, fanart_path, thumb_path, path, filepath, json_headers=None): - full_filepath = os.path.join(path, fanart_path) + full_filepath = os.path.join(path, thumb_path) if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath): return if json_headers != None: - if download_file_with_filename(cover, fanart_path, path, filepath, json_headers['headers']) == 'failed': + if download_file_with_filename(cover, thumb_path, path, filepath, json_headers['headers']) == 'failed': moveFailedFolder(filepath) return else: - if download_file_with_filename(cover, fanart_path, path, filepath) == 'failed': + if download_file_with_filename(cover, thumb_path, path, filepath) == 'failed': moveFailedFolder(filepath) return @@ -296,20 +299,21 @@ def image_download(cover, fanart_path, thumb_path, path, filepath, json_headers= if file_not_exist_or_empty(full_filepath): print('[!]Image Download Failed! Trying again. [{}/3]', i + 1) if json_headers != None: - download_file_with_filename(cover, fanart_path, path, filepath, json_headers['headers']) + download_file_with_filename(cover, thumb_path, path, filepath, json_headers['headers']) else: - download_file_with_filename(cover, fanart_path, path, filepath) + download_file_with_filename(cover, thumb_path, path, filepath) continue else: break if file_not_exist_or_empty(full_filepath): return print('[+]Image Downloaded!', Path(full_filepath).name) - shutil.copyfile(full_filepath, os.path.join(path, thumb_path)) + if not config.getInstance().jellyfin(): + shutil.copyfile(full_filepath, os.path.join(path, fanart_path)) def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, - uncensored, hack_word, _4k, fanart_path, poster_path, thumb_path): + uncensored, hack, hack_word, _4k, fanart_path, poster_path, thumb_path): title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info( json_data) if config.getInstance().main_mode() == 3: # 模式3下,由于视频文件不做任何改变,.nfo文件必须和视频文件名称除后缀外完全一致,KODI等软件方可支持 @@ -332,13 +336,23 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f pass # KODI内查看影片信息时找不到number,配置naming_rule=number+'#'+title虽可解决 # 但使得标题太长,放入时常为空的outline内会更适合,软件给outline留出的显示版面也较大 - outline = f"{number}#{outline}" + if not outline: + pass + elif json_data['source'] == 'pissplay': + outline = f"{outline}" + else: + outline = f"{number}#{outline}" with open(nfo_path, "wt", encoding='UTF-8') as code: print('', file=code) print("", file=code) - print(" <![CDATA[" + naming_rule + "]]>", file=code) - print(" ", file=code) - print(" ", file=code) + if not config.getInstance().jellyfin(): + print(" <![CDATA[" + naming_rule + "]]>", file=code) + print(" ", file=code) + print(" ", file=code) + else: + print(" " + naming_rule + "", file=code) + print(" " + json_data['original_naming_rule'] + "", file=code) + print(" " + naming_rule + "", file=code) print(" JP-18+", file=code) print(" JP-18+", file=code) try: @@ -347,13 +361,18 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" ", file=code) print(" " + studio + "", file=code) print(" " + year + "", file=code) - print(" ", file=code) - print(" ", file=code) + if not config.getInstance().jellyfin(): + print(" ", file=code) + print(" ", file=code) + else: + print(" " + outline + "", file=code) + print(" " + outline + "", file=code) print(" " + str(runtime).replace(" ", "") + "", file=code) print(" " + director + "", file=code) print(" " + poster_path + "", file=code) print(" " + thumb_path + "", file=code) - print(" " + fanart_path + "", file=code) + if not config.getInstance().jellyfin(): # jellyfin 不需要保存fanart + print(" " + fanart_path + "", file=code) try: for key in actor_list: print(" ", file=code) @@ -368,8 +387,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f print(" " + studio + "", file=code) print(" ", file=code) - skip_tags = config.getInstance().donot_save_tags() - if not skip_tags: + jellyfin = config.getInstance().jellyfin() + if not jellyfin: if config.getInstance().actor_only_tag(): for key in actor_list: try: @@ -377,27 +396,27 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except: pass else: - if cn_sub == '1': + if cn_sub: print(" 中文字幕", file=code) - if liuchu == '流出': + if liuchu: print(" 流出", file=code) - if uncensored == 1: + if uncensored: print(" 无码", file=code) - if hack_word != '': + if hack: print(" 破解", file=code) - if _4k == '1': + if _4k: print(" 4k", file=code) for i in tag: print(" " + i + "", file=code) - if cn_sub == '1': + if cn_sub: print(" 中文字幕", file=code) - if liuchu == '流出': + if liuchu: print(" 无码流出", file=code) - if uncensored == 1: + if uncensored: print(" 无码", file=code) - if hack_word != '': + if hack: print(" 破解", file=code) - if _4k == '1': + if _4k: print(" 4k", file=code) try: for i in tag: @@ -470,6 +489,7 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack, _4k) -> No :cn_sub: 中文字幕 可选值:1,"1" 或其他值 :uncensored 无码 可选值:1,"1" 或其他值 :hack 破解 可选值:1,"1" 或其他值 + :_4k Bool """ mark_type = '' if cn_sub: @@ -495,17 +515,17 @@ def add_mark_thread(pic_path, cn_sub, leak, uncensored, hack, _4k): # 获取自定义位置,取余配合pos达到顺时针添加的效果 # 左上 0, 右上 1, 右下 2, 左下 3 count = config.getInstance().watermark_type() - if cn_sub == 1 or cn_sub == '1': + if cn_sub: add_to_pic(pic_path, img_pic, size, count, 1) # 添加 count = (count + 1) % 4 - if leak == 1 or leak == '1': + if leak: add_to_pic(pic_path, img_pic, size, count, 2) count = (count + 1) % 4 - if uncensored == 1 or uncensored == '1': + if uncensored: add_to_pic(pic_path, img_pic, size, count, 3) - if hack == 1 or hack == '1': + if hack: add_to_pic(pic_path, img_pic, size, count, 4) - if _4k == 1 or _4k == '1': + if _4k: add_to_pic(pic_path, img_pic, size, count, 5) img_pic.close() @@ -613,6 +633,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo create_softlink = False if link_mode not in (1, 2): shutil.move(filepath, targetpath) + print("[!]Move => ", path) + return elif link_mode == 2: try: os.link(filepath, targetpath, follow_symlinks=False) @@ -624,16 +646,13 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo os.symlink(filerelpath, targetpath) except: os.symlink(str(filepath_obj.resolve()), targetpath) - return + print("[!]Link => ", path) except FileExistsError as fee: print(f'[-]FileExistsError: {fee}') - return except PermissionError: print('[-]Error! Please run as administrator!') - return except OSError as oserr: print(f'[-]OS Error errno {oserr.errno}') - return def linkImage(path, number, part, leak_word, c_word, hack_word, ext): @@ -693,12 +712,12 @@ def core_main_no_net_op(movie_path, number): conf = config.getInstance() part = '' leak_word = '' - leak = 0 + leak = False c_word = '' - cn_sub = '' - hack = '' + cn_sub = False + hack = False hack_word = '' - _4k = '' + _4k = False imagecut = 1 multi = False part = '' @@ -709,30 +728,30 @@ def core_main_no_net_op(movie_path, number): multi = True if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path, re.I) or '中文' in movie_path or '字幕' in movie_path or ".chs" in movie_path or '.cht' in movie_path: - cn_sub = '1' + cn_sub = True c_word = '-C' # 中文字幕影片后缀 - uncensored = 1 if is_uncensored(number) else 0 + uncensored = True if is_uncensored(number) else 0 if '流出' in movie_path or 'uncensored' in movie_path.lower(): leak_word = '-无码流出' # 无码流出影片后缀 - leak = 1 + leak = True if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path: - hack = 1 + hack = True hack_word = "-hack" # try: + # props = get_video_properties(movie_path) # 判断是否为4K视频 # if props['width'] >= 4096 or props['height'] >= 2160: - # _4k = '1' + # _4k = True # except: # pass - prestr = f"{number}{leak_word}{c_word}{hack_word}" full_nfo = Path(path) / f"{prestr}{part}.nfo" if full_nfo.is_file(): if full_nfo.read_text(encoding='utf-8').find(r'无码') >= 0: - uncensored = 1 + uncensored = True try: nfo_xml = etree.parse(full_nfo) nfo_fanart_path = nfo_xml.xpath('//fanart/text()')[0] @@ -791,15 +810,15 @@ def move_subtitles(filepath, path, multi_part, number, part, leak_word, c_word, def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=None): conf = config.getInstance() # =======================================================================初始化所需变量 - multi_part = 0 + multi_part = False part = '' leak_word = '' c_word = '' - cn_sub = '' - liuchu = '' - hack = '' + cn_sub = False + liuchu = False + hack = False hack_word = '' - _4k = '' + _4k = False # 下面被注释的变量不需要 # rootpath = os.getcwd @@ -822,11 +841,11 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N tag = json_data.get('tag') # =======================================================================判断-C,-CD后缀 if re.search('[-_]CD\d+', movie_path, re.IGNORECASE): - multi_part = 1 + multi_part = True part = re.findall('[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper() if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path, re.I) or '中文' in movie_path or '字幕' in movie_path: - cn_sub = '1' + cn_sub = True c_word = '-C' # 中文字幕影片后缀 # 判断是否无码 @@ -835,19 +854,22 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N if '流出' in movie_path or 'uncensored' in movie_path.lower(): liuchu = '流出' - leak = 1 + leak = True leak_word = '-无码流出' # 流出影片后缀 else: - leak = 0 + leak = False if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path: - hack = 1 + hack = True hack_word = "-hack" + if '4k'.upper() in str(movie_path).upper() or '4k' in movie_path: + _4k = True + # 判断是否4k if '4K' in tag: tag.remove('4K') # 从tag中移除'4K' - + # 判断是否为无码破解 if '无码破解' in tag: tag.remove('无码破解') # 从tag中移除'无码破解' @@ -855,7 +877,7 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N # try: # props = get_video_properties(movie_path) # 判断是否为4K视频 # if props['width'] >= 4096 or props['height'] >= 2160: - # _4k = '1' + # _4k = True # except: # pass @@ -920,7 +942,7 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N pass # 裁剪图 - cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored)) + cutImage(imagecut, path, thumb_path, poster_path, bool(conf.face_uncensored_only() and not uncensored)) # 兼容Jellyfin封面图文件名规则 if multi_part and conf.jellyfin_multi_part_fanart(): @@ -932,7 +954,7 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N # Move subtitles move_status = move_subtitles(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) if move_status: - cn_sub = "1" + cn_sub = True # 添加水印 if conf.is_watermark(): add_mark(os.path.join(path, poster_path), os.path.join(path, thumb_path), cn_sub, leak, uncensored, @@ -940,7 +962,7 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, tag, - json_data.get('actor_list'), liuchu, uncensored, hack_word + json_data.get('actor_list'), liuchu, uncensored, hack, hack_word , _4k, fanart_path, poster_path, thumb_path) elif conf.main_mode() == 2: @@ -948,13 +970,9 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N path = create_folder(json_data) # 移动文件 paste_file_to_folder_mode2(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) + # Move subtitles - move_status = move_subtitles(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) - if move_status: - cn_sub = "1" - if conf.is_watermark(): - add_mark(os.path.join(path, poster_path), os.path.join(path, thumb_path), cn_sub, leak, uncensored, hack, - _4k) + move_subtitles(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word) elif conf.main_mode() == 3: path = str(Path(movie_path).parent) @@ -998,7 +1016,7 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N # 添加水印 if conf.is_watermark(): - add_mark(os.path.join(path, poster_path), os.path.join(path, thumb_path), cn_sub, leak, uncensored, hack, + add_mark(os.path.join(path, poster_path), os.path.join(path, fanart_path), cn_sub, leak, uncensored, hack, _4k) # 兼容Jellyfin封面图文件名规则 @@ -1007,5 +1025,5 @@ def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=N # 最后输出.nfo元数据文件,以完成.nfo文件创建作为任务成功标志 print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, - tag, json_data.get('actor_list'), liuchu, uncensored, hack_word, fanart_path, poster_path, + tag, json_data.get('actor_list'), liuchu, uncensored, hack, hack_word, _4k, fanart_path, poster_path, thumb_path) diff --git a/donate.png b/donate.png index 2407278..331347e 100644 Binary files a/donate.png and b/donate.png differ diff --git a/number_parser.py b/number_parser.py index d9f6f7d..17bfc9f 100755 --- a/number_parser.py +++ b/number_parser.py @@ -7,7 +7,7 @@ import typing G_spat = re.compile( "^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|" "^(fhd|hd|sd|1080p|720p|4K)(-|_)|" - "(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|leak)", + "(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|hack|leak)", re.IGNORECASE) @@ -54,12 +54,12 @@ def get_number(debug: bool, file_path: str) -> str: filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # 去除文件名中时间 lower_check = filename.lower() if 'fc2' in lower_check: - filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper() + filename = lower_check.replace('--', '-').replace('_', '-').upper() filename = re.sub("[-_]cd\d{1,2}", "", filename, flags=re.IGNORECASE) if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group()) file_number = os.path.splitext(filename) - filename = re.search(r'\w+(-|_)\w+', filename, re.A) + filename = re.search(r'[\w\-_]+', filename, re.A) if filename: file_number = str(filename.group()) else: @@ -85,34 +85,7 @@ def get_number(debug: bool, file_path: str) -> str: print(f'[-]Number Parser exception: {e} [{file_path}]') return None -# modou提取number -def md(filename): - m = re.search(r'(md[a-z]{0,2}-?)(\d{2,})(-ep\d*|-\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(4)}{m.group(3) or ""}' -def mmz(filename): - m = re.search(r'(mmz-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def msd(filename): - m = re.search(r'(msd-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def mky(filename): - m = re.search(r'(mky-[a-z]{2,2}-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def yk(filename): - m = re.search(r'(yk-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def pm(filename): - m = re.search(r'(pm[a-z]?-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def fsog(filename): - m = re.search(r'(fsog-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' # 按javdb数据源的命名规范提取number G_TAKE_NUM_RULES = { @@ -126,13 +99,6 @@ G_TAKE_NUM_RULES = { 'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0], 'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()), 'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()), - r'\bmd[a-z]{0,2}-\d{2,}': md, - r'\bmmz-\d{2,}':mmz, - r'\bmsd-\d{2,}':msd, - r'\bmky-[a-z]{2,2}-\d{2,}':mky, - r'\byk-\d{2,3}': yk, - r'\bpm[a-z]?-?\d{2,}':pm, - r'\bfsog-?\d{2,}':fsog } diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 index 399963b..c90068d 100644 --- a/py_to_exe.ps1 +++ b/py_to_exe.ps1 @@ -1,25 +1,25 @@ -# If you can't run this script, please execute the following command in PowerShell. -# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force - -$CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1) -$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1) -$FACE_RECOGNITION_MODELS=$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | select -Last 1) - -mkdir build -mkdir __pycache__ - -pyinstaller --onefile Movie_Data_Capture.py ` - --hidden-import "ImageProcessing.cnn" ` - --python-option u ` - --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" ` - --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` - --add-data "$OPENCC_PATH;opencc" ` - --add-data "Img;Img" ` - --add-data "config.ini;." ` - -rmdir -Recurse -Force build -rmdir -Recurse -Force __pycache__ -rmdir -Recurse -Force Movie_Data_Capture.spec - -echo "[Make]Finish" -pause +# If you can't run this script, please execute the following command in PowerShell. +# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force + +# bugfix:set submodules find path +$Env:PYTHONPATH=$pwd.path +$PYTHONPATH=$pwd.path +mkdir build +mkdir __pycache__ + +pyinstaller --collect-submodules "scrapinglib" ` + --collect-submodules "ImageProcessing" ` + --collect-data "face_recognition_models" ` + --collect-data "cloudscraper" ` + --collect-data "opencc" ` + --add-data "Img;Img" ` + --add-data "config.ini;." ` + --onefile Movie_Data_Capture.py + + +rmdir -Recurse -Force build +rmdir -Recurse -Force __pycache__ +rmdir -Recurse -Force Movie_Data_Capture.spec + +echo "[Make]Finish" +pause diff --git a/requirements.txt b/requirements.txt index 9781404..2014f84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,8 @@ requests +dlib-bin +Click +numpy +face-recognition-models lxml beautifulsoup4 pillow @@ -8,5 +12,3 @@ urllib3 certifi MechanicalSoup opencc-python-reimplemented -face_recognition -get-video-properties diff --git a/scraper.py b/scraper.py index 91b4219..7e54f52 100644 --- a/scraper.py +++ b/scraper.py @@ -99,6 +99,10 @@ def get_data_from_json( # ================================================网站规则添加结束================================================ + if json_data.get('title') == '': + print('[-]Movie Number or Title not found!') + return None + title = json_data.get('title') actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # 字符串转列表 actor_list = [actor.strip() for actor in actor_list] # 去除空白 @@ -134,11 +138,10 @@ def get_data_from_json( tag.remove('XXXX') while 'xxx' in tag: tag.remove('xxx') - actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') - - if title == '' or number == '': - print('[-]Movie Number or Title not found!') - return None + if json_data['source'] =='pissplay': # pissplay actor为英文名,不用去除空格 + actor = str(actor_list).strip("[ ]").replace("'", '') + else: + actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') # if imagecut == '3': # DownloadFileWithFilename() @@ -266,14 +269,22 @@ def get_data_from_json( pass naming_rule = "" + original_naming_rule = "" for i in conf.naming_rule().split("+"): if i not in json_data: naming_rule += i.strip("'").strip('"') + original_naming_rule += i.strip("'").strip('"') else: item = json_data.get(i) naming_rule += item if type(item) is not list else "&".join(item) + # PATCH:处理[title]存在翻译的情况,后续NFO文件的original_name只会直接沿用naming_rule,这导致original_name非原始名 + # 理应在翻译处处理 naming_rule和original_naming_rule + if i == 'title': + item = json_data.get('original_title') + original_naming_rule += item if type(item) is not list else "&".join(item) json_data['naming_rule'] = naming_rule + json_data['original_naming_rule'] = original_naming_rule return json_data diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py index ee27a25..e2144f5 100644 --- a/scrapinglib/__init__.py +++ b/scrapinglib/__init__.py @@ -1,3 +1,2 @@ # -*- coding: utf-8 -*- - -from .api import search, getSupportedSources +from .api import search, getSupportedSources \ No newline at end of file diff --git a/scrapinglib/api.py b/scrapinglib/api.py index 0a9c7fe..29de0aa 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -20,6 +20,8 @@ from .xcity import Xcity from .avsox import Avsox from .javlibrary import Javlibrary from .javday import Javday +from .pissplay import Pissplay +from .javmenu import Javmenu from .tmdb import Tmdb from .imdb import Imdb @@ -51,8 +53,8 @@ class Scraping: """ """ adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321', - 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', - 'getchu', 'gcolle','javday' + 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', + 'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu' ] adult_func_mapping = { 'avsox': Avsox().scrape, @@ -70,7 +72,9 @@ class Scraping: 'javdb': Javdb().scrape, 'getchu': Getchu().scrape, 'javlibrary': Javlibrary().scrape, - 'javday': Javday().scrape + 'javday': Javday().scrape, + 'pissplay': Pissplay().scrape, + 'javmenu': Javmenu().scrape } general_full_sources = ['tmdb', 'imdb'] @@ -143,6 +147,14 @@ class Scraping: print(f'[-]Movie Number [{name}] not found!') return None + # If actor is anonymous, Fill in Anonymous + if len(json_data['actor']) == 0: + if config.getInstance().anonymous_fill() == True: + if "zh_" in config.getInstance().get_target_language(): + json_data['actor'] = "佚名" + else: + json_data['actor'] = "Anonymous" + return json_data def searchAdult(self, number, sources): @@ -174,13 +186,13 @@ class Scraping: break except: continue - + # javdb的封面有水印,如果可以用其他源的封面来替换javdb的封面 if 'source' in json_data and json_data['source'] == 'javdb': # search other sources other_sources = sources[sources.index('javdb') + 1:] while other_sources: - # If cover not found in other source, then skip using other sources using javdb cover instead + # If cover not found in other source, then skip using other sources using javdb cover instead try: other_json_data = self.searchAdult(number, other_sources) if other_json_data is not None and 'cover' in other_json_data and other_json_data['cover'] != '': @@ -195,12 +207,20 @@ class Scraping: other_sources = sources[sources.index(other_json_data['source']) + 1:] except: pass - + # Return if data not found in all sources if not json_data: print(f'[-]Movie Number [{number}] not found!') return None + # If actor is anonymous, Fill in Anonymous + if len(json_data['actor']) == 0: + if config.getInstance().anonymous_fill() == True: + if "zh_" in config.getInstance().get_target_language(): + json_data['actor'] = "佚名" + else: + json_data['actor'] = "Anonymous" + return json_data def checkGeneralSources(self, c_sources, name): @@ -283,4 +303,8 @@ class Scraping: return False if data["number"] is None or data["number"] == "" or data["number"] == "null": return False + if (data["cover"] is None or data["cover"] == "" or data["cover"] == "null") \ + and (data["cover_small"] is None or data["cover_small"] == "" or + data["cover_small"] == "null"): + return False return True diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index f3b4a2e..8397c72 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -31,12 +31,14 @@ class Avsox(Parser): site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href') self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') - if result1 == '' or result1 == 'null' or result1 == 'None': + if result1 == '' or result1 == 'null' or result1 == 'None' or result1.find('movie') == -1: self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_')) result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') - if result1 == '' or result1 == 'null' or result1 == 'None': + if result1 == '' or result1 == 'null' or result1 == 'None' or result1.find('movie') == -1: self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', '')) result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + if result1 == '' or result1 == 'null' or result1 == 'None' or result1.find('movie') == -1: + return None return "https:" + result1 def getNum(self, htmltree): diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 858115a..e9b25c3 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -49,13 +49,13 @@ class Fanza(Parser): self.detailurl = url + fanza_search_number url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl}) self.htmlcode = self.getHtml(url) - if self.htmlcode != 404: + if self.htmlcode != 404 \ + and 'Sorry! This content is not available in your region.' not in self.htmlcode: self.htmltree = etree.HTML(self.htmlcode) - break - if self.htmlcode == 404: - return 404 - result = self.dictformat(self.htmltree) - return result + if self.htmltree is not None: + result = self.dictformat(self.htmltree) + return result + return 404 def getNum(self, htmltree): # for some old page, the input number does not match the page diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 21629ea..7f11851 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -22,6 +22,7 @@ class Fc2(Parser): def extraInit(self): self.imagecut = 0 + self.allow_number_change = True def search(self, number): self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '') diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index b7b1734..20cfcd3 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -128,7 +128,7 @@ class Javbus(Parser): def getTags(self, htmltree): tags = self.getTreeElement(htmltree, self.expr_tags).split(',') - return tags[1:] + return tags[2:] def getOutline(self, htmltree): if self.morestoryline: diff --git a/scrapinglib/javday.py b/scrapinglib/javday.py index a462ba1..cb8edf7 100644 --- a/scrapinglib/javday.py +++ b/scrapinglib/javday.py @@ -39,3 +39,8 @@ class Javday(Parser): # 删除番号和网站名 result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip() return result + + def getTags(self, htmltree) -> list: + tags = super().getTags(htmltree) + return [tag for tag in tags if 'JAVDAY.TV' not in tag] + \ No newline at end of file diff --git a/scrapinglib/javmenu.py b/scrapinglib/javmenu.py new file mode 100644 index 0000000..099f314 --- /dev/null +++ b/scrapinglib/javmenu.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urljoin + +from .parser import Parser + + +class Javmenu(Parser): + source = 'javmenu' + + expr_title = '/html/head/meta[@property="og:title"]/@content' + expr_cover = '/html/head/meta[@property="og:image"]/@content' + + expr_number = '//span[contains(text(),"番號") or contains(text(),"番号")]/../a/text()' + expr_number2 = '//span[contains(text(),"番號") or contains(text(),"番号")]/../span[2]/text()' + expr_runtime = '//span[contains(text(),"時長;") or contains(text(),"时长")]/../span[2]/text()' + expr_release = '//span[contains(text(),"日期")]/../span[2]/text()' + expr_studio = '//span[contains(text(),"製作")]/../span[2]/a/text()' + + expr_actor = '//a[contains(@class,"actress")]/text()' + expr_tags = '//a[contains(@class,"genre")]/text()' + + def extraInit(self): + self.imagecut = 4 + self.uncensored = True + + def search(self, number): + self.number = number + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://javmenu.com/zh/' + self.number + '/' + self.htmlcode = self.getHtml(self.detailurl) + if self.htmlcode == 404: + return 404 + htmltree = etree.HTML(self.htmlcode) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + # 番号被分割开,需要合并后才是完整番号 + part1 = self.getTreeElement(htmltree, self.expr_number) + part2 = self.getTreeElement(htmltree, self.expr_number2) + dp_number = part1 + part2 + # NOTE 检测匹配与更新 self.number + if dp_number.upper() != self.number.upper(): + raise Exception(f'[!] {self.number}: find [{dp_number}] in javmenu, not match') + self.number = dp_number + return self.number + + def getTitle(self, htmltree): + browser_title = super().getTitle(htmltree) + # 删除番号 + number = re.findall("\d+",self.number)[1] + title = browser_title.split(number,1)[-1] + title = title.replace(' | JAV目錄大全 | 每日更新',"") + title = title.replace(' | JAV目录大全 | 每日更新',"").strip() + return title.replace(self.number, '').strip() + diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index 6e288b6..91742ff 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -6,6 +6,28 @@ from urllib.parse import urlparse, unquote from .parser import Parser +NUM_RULES3=[ + r'(mmz{2,4})-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(msd)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(yk)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(pm)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(mky-[a-z]{2,2})-?(\d{2,})(-ep\d*|-\d*)?.*', +] + +# modou提取number +def change_number(number): + number = number.lower().strip() + m = re.search(r'(md[a-z]{0,2})-?(\d{2,})(-ep\d*|-\d*)?.*', number, re.I) + if m: + return f'{m.group(1)}{m.group(2).zfill(4)}{m.group(3) or ""}' + for rules in NUM_RULES3: + m = re.search(rules, number, re.I) + if m: + return f'{m.group(1)}{m.group(2).zfill(3)}{m.group(3) or ""}' + return number + + + class Madou(Parser): source = 'madou' @@ -14,12 +36,15 @@ class Madou(Parser): expr_studio = '//a[@rel="category tag"]/text()' expr_tags = '/html/head/meta[@name="keywords"]/@content' + + def extraInit(self): - self.imagecut = 0 + self.imagecut = 4 self.uncensored = True + self.allow_number_change = True def search(self, number): - self.number = number.lower().strip() + self.number = change_number(number) if self.specifiedUrl: self.detailurl = self.specifiedUrl else: @@ -65,5 +90,5 @@ class Madou(Parser): def getTags(self, htmltree): studio = self.getStudio(htmltree) - x = super().getTags(htmltree) - return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] + tags = super().getTags(htmltree) + return [tag for tag in tags if studio not in tag and '麻豆' not in tag] diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index dbbf8e4..8c8b7fa 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -85,7 +85,7 @@ class Parser: else: self.detailurl = self.queryNumberUrl(number) if not self.detailurl: - return None + return 404 htmltree = self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) return result @@ -210,6 +210,13 @@ class Parser: def getTags(self, htmltree) -> list: alls = self.getTreeAll(htmltree, self.expr_tags) + tags = [] + for t in alls: + for tag in t.strip().split(','): + tag = tag.strip() + if tag: + tags.append(tag) + return tags return [ x.strip() for x in alls if x.strip()] def getStudio(self, htmltree): diff --git a/scrapinglib/pissplay.py b/scrapinglib/pissplay.py new file mode 100644 index 0000000..04bc898 --- /dev/null +++ b/scrapinglib/pissplay.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from .parser import Parser +from datetime import datetime + +# 搜刮 https://pissplay.com/ 中的视频 +# pissplay中的视频没有番号,所以要通过文件名搜索 +# 只用文件名和网站视频名完全一致时才可以被搜刮 +class Pissplay(Parser): + source = 'pissplay' + + expr_number = '//*[@id="video_title"]/text()' #这个网站上的视频没有番号,因此用标题代替 + expr_title = '//*[@id="video_title"]/text()' + expr_cover = '/html/head//meta[@property="og:image"]/@content' + expr_tags = '//div[@id="video_tags"]/a/text()' + expr_release = '//div[@class="video_date"]/text()' + expr_outline = '//*[@id="video_description"]/p//text()' + + def extraInit(self): + self.imagecut = 0 # 不裁剪封面 + self.specifiedSource = None + + def search(self, number): + self.number = number.strip().upper() + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + newName = re.sub(r"[^a-zA-Z0-9 ]", "", number) # 删除特殊符号 + self.detailurl = "https://pissplay.com/videos/" + newName.lower().replace(" ","-") + "/" + self.htmlcode = self.getHtml(self.detailurl) + if self.htmlcode == 404: + return 404 + htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + title = self.getTitle(htmltree) + return title + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + title = re.sub(r"[^a-zA-Z0-9 ]", "", title) # 删除特殊符号 + return title + + def getCover(self, htmltree): + url = super().getCover(htmltree) + if not url.startswith('http'): + url = 'https:' + url + return url + + def getRelease(self, htmltree): + releaseDate = super().getRelease(htmltree) + isoData = datetime.strptime(releaseDate, '%d %b %Y').strftime('%Y-%m-%d') + return isoData + + def getStudio(self, htmltree): + return 'PissPlay' + + def getTags(self, htmltree): + tags = self.getTreeAll(htmltree, self.expr_tags) + if 'Guests' in tags: + if tags[0] == 'Collaboration' or tags[0] == 'Toilet for a Day' or tags[0] == 'Collaboration': + del tags[1] + else: + tags = tags[1:] + return tags + + def getActors(self, htmltree) -> list: + tags = self.getTreeAll(htmltree, self.expr_tags) + if 'Guests' in tags: + if tags[0] == 'Collaboration' or tags[0] == 'Toilet for a Day' or tags[0] == 'Collaboration': + return [tags[1]] + else: + return [tags[0]] + else: + return ['Bruce and Morgan'] + + def getOutline(self, htmltree): + outline = self.getTreeAll(htmltree, self.expr_outline) + if '– Morgan xx' in outline: + num = outline.index('– Morgan xx') + outline = outline[:num] + rstring = ''.join(outline).replace("&","and") + return rstring