Merge branch 'upstream'

# Conflicts:
#	WebCrawler/fanza.py
Deng Zhou committed on 2022-04-29 23:53:21 +08:00
34 changed files with 25968 additions and 18336 deletions


@@ -39,7 +39,7 @@ jobs:
         run: |
           pyinstaller \
             --onefile Movie_Data_Capture.py \
-            --hidden-import "ImageProcessing.hog" \
+            --hidden-import "ImageProcessing.cnn" \
             --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
             --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
             --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
@@ -51,7 +51,7 @@ jobs:
         run: |
           pyinstaller `
             --onefile Movie_Data_Capture.py `
-            --hidden-import "ImageProcessing.hog" `
+            --hidden-import "ImageProcessing.cnn" `
             --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
             --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
             --add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1);face_recognition_models" `
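Both jobs run the same probe to locate a package's install directory before handing it to --add-data (src:dest on Linux/macOS, src;dest on Windows, hence the differing separators above). A minimal standalone sketch of what the inline `python -c` one-liner resolves; the `tail -n 1` in the workflow only guards against packages that print extra output on import:

# What the --add-data probes evaluate: a package's __path__ lists the
# directories it was loaded from, and PyInstaller copies that folder
# into the bundle.
import cloudscraper as _

print(_.__path__[0])  # e.g. .../site-packages/cloudscraper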


@@ -18,6 +18,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
 from concurrent.futures import ThreadPoolExecutor
+from unicodedata import category
 
 def getXpathSingle(htmlcode, xpath):
@@ -26,7 +27,7 @@ def getXpathSingle(htmlcode, xpath):
     return result1
 
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
 
 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
@@ -69,7 +70,6 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
     print('[-]Connect Failed! Please check your Proxy or Network!')
     raise Exception('Connect Failed')
 
-
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
     configProxy = config.getInstance().proxy()
     errors = ""
@@ -381,7 +381,7 @@ def load_cookies(cookie_json_filename: str):
                 break
         if not cookies_filename:
             return None, None
-        return json.load(open(cookies_filename)), cookies_filename
+        return json.loads(Path(cookies_filename).read_text(encoding='utf-8')), cookies_filename
     except:
         return None, None
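The rewritten return line closes the file deterministically and pins the encoding; the old `json.load(open(...))` left the handle to the garbage collector and used the platform default encoding. A minimal sketch of the difference, with a hypothetical cookies path:

import json
from pathlib import Path

cookies_file = Path('cookies.json')  # hypothetical path, for illustration

# Old style: the handle is closed only when the file object is collected,
# and the platform default encoding is used.
data = json.load(open(cookies_file))

# New style: read_text() opens, reads and closes in one call,
# with an explicit UTF-8 decode.
data = json.loads(cookies_file.read_text(encoding='utf-8'))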
@@ -466,7 +466,7 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
             os.makedirs(path)
         except:
             print(f"[-]Fatal error! Can not make folder '{path}'")
-            sys.exit(0)
+            os._exit(0)
     proxies = configProxy.proxies()
     headers = {
         'User-Agent': G_USER_AGENT}
@@ -483,7 +483,7 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
             os.makedirs(path)
         except:
             print(f"[-]Fatal error! Can not make folder '{path}'")
-            sys.exit(0)
+            os._exit(0)
     headers = {
         'User-Agent': G_USER_AGENT}
     r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -519,14 +519,13 @@ def download_one_file(args) -> str:
     wrapped for map function
     """
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
+    (url, save_path) = args
+    filebytes = get_html(url, return_type='content')
+    if isinstance(filebytes, bytes) and len(filebytes):
+        with save_path.open('wb') as fpbyte:
+            if len(filebytes) == fpbyte.write(filebytes):
+                return str(save_path)
 
 def parallel_download_files(dn_list: typing.Iterable[typing.Sequence], parallel: int = 0):
     """
@@ -567,6 +566,7 @@ def delete_all_elements_in_list(string: str, lists: typing.Iterable[str]):
             new_lists.append(i)
     return new_lists
 
+
 def delete_all_elements_in_str(string_delete: str, string: str):
     """
     delete same string in given list
@@ -574,4 +574,9 @@ def delete_all_elements_in_str(string_delete: str, string: str):
     for i in string:
         if i == string_delete:
             string = string.replace(i,"")
     return string
+
+# print format空格填充对齐,内容包含中文时的空格计算
+def cnspace(v: str, n: int) -> int:
+    return n - [category(c) for c in v].count('Lo')
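cnspace() shrinks a format width by the number of wide characters (Unicode category 'Lo', which covers CJK), so mixed Chinese/ASCII columns line up when each wide glyph occupies two terminal cells. A small usage sketch:

# Each CJK character prints two cells wide, so reduce the pad width by one
# per 'Lo' character to keep the right edge of the column aligned.
for name in ('abc', '中文abc'):
    print(f'{name:<{cnspace(name, 12)}}|')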


@@ -1,12 +1,18 @@
+import sys
+sys.path.append('../')
 import logging
 import os
 import config
 import importlib
+from pathlib import Path
 from PIL import Image
 import shutil
+from ADC_function import file_not_exist_or_empty
 
 def face_crop_width(filename, width, height):
+    aspect_ratio = config.getInstance().face_aspect_ratio()
     # 新宽度是高度的2/3
     cropWidthHalf = int(height/3)
     try:
@@ -21,15 +27,15 @@ def face_crop_width(filename, width, height):
         # 越界处理
         if cropLeft < 0:
             cropLeft = 0
-            cropRight = cropWidthHalf*2
+            cropRight = cropWidthHalf * aspect_ratio
         elif cropRight > width:
-            cropLeft = width-cropWidthHalf*2
+            cropLeft = width - cropWidthHalf * aspect_ratio
             cropRight = width
         return (cropLeft, 0, cropRight, height)
     except:
         print('[-]Not found face! ' + filename)
         # 默认靠右切
-        return (width-cropWidthHalf*2, 0, width, height)
+        return (width-cropWidthHalf * aspect_ratio, 0, width, height)
 
 def face_crop_height(filename, width, height):
@@ -54,29 +60,43 @@ def face_crop_height(filename, width, height):
     return (0, 0, width, cropHeight)
 
-def cutImage(imagecut, path, fanart_path, poster_path):
+def cutImage(imagecut, path, fanart_path, poster_path, skip_facerec=False):
+    conf = config.getInstance()
     fullpath_fanart = os.path.join(path, fanart_path)
     fullpath_poster = os.path.join(path, poster_path)
-    if imagecut == 1:  # 剪裁大封面
+    aspect_ratio = conf.face_aspect_ratio()
+    if conf.face_aways_imagecut():
+        imagecut = 1
+    elif conf.download_only_missing_images() and not file_not_exist_or_empty(fullpath_poster):
+        return
+    # imagecut为4时同时也是有码影片,也用人脸识别裁剪封面
+    if imagecut == 1 or imagecut == 4:  # 剪裁大封面
         try:
             img = Image.open(fullpath_fanart)
             width, height = img.size
             if width/height > 2/3:  # 如果宽度大于2
-                # 以人像为中心切取
-                img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
+                if imagecut == 4:
+                    # 以人像为中心切取
+                    img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
+                elif skip_facerec:
+                    # 有码封面默认靠右切
+                    img2 = img.crop((width - int(height / 3) * aspect_ratio, 0, width, height))
+                else:
+                    # 以人像为中心切取
+                    img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
             elif width/height < 2/3:  # 如果高度大于3
                 # 从底部向上切割
                 img2 = img.crop(face_crop_height(fullpath_fanart, width, height))
             else:  # 如果等于2/3
                 img2 = img
             img2.save(fullpath_poster)
-            print('[+]Image Cutted! ' + fullpath_poster)
+            print(f"[+]Image Cutted! {Path(fullpath_poster).name}")
         except Exception as e:
             print(e)
             print('[-]Cover cut failed!')
     elif imagecut == 0:  # 复制封面
         shutil.copyfile(fullpath_fanart, fullpath_poster)
-        print('[+]Image Copyed! ' + fullpath_poster)
+        print(f"[+]Image Copyed! {Path(fullpath_poster).name}")
 
 def face_center(filename, model):
@@ -91,5 +111,5 @@ def face_center(filename, model):
     return (0, 0)
 
 if __name__ == '__main__':
-    cutImage(1,'H:\\test\\','12.jpg','test.jpg')
+    cutImage(1,'z:/t/','p.jpg','o.jpg')
+    #cutImage(1,'H:\\test\\','12.jpg','test.jpg')
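The 2/3 threshold mirrors the target poster aspect ratio: a fanart wider than 2:3 is cropped horizontally to a window `int(height/3) * aspect_ratio` pixels wide. A worked sketch of the right-edge crop used when face recognition is skipped, assuming the default face_aspect_ratio of 2 (the config getter itself is not shown in this diff):

# Hypothetical numbers: an 800x540 landscape fanart, aspect_ratio = 2.
width, height, aspect_ratio = 800, 540, 2
crop_width = int(height / 3) * aspect_ratio   # 180 * 2 = 360 px
box = (width - crop_width, 0, width, height)  # (440, 0, 800, 540)
# Image.crop(box) then yields a 360x540 poster: the right-hand 2:3 slice.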


@@ -1,4 +1,8 @@
-import hog
+import sys
+sys.path.append('../')
+
+from ImageProcessing.hog import face_center as hog_face_center
 
 def face_center(filename, model):
-    return hog.face_center(filename, model)
+    return hog_face_center(filename, model)


@@ -7,18 +7,20 @@ SHELL = /bin/bash
 .DEFAULT: make
 
 make:
-	#@echo "[+]make prepare-dev"
+	@echo "[+]make prepare-dev"
-	#sudo apt-get -y install python3.7 python3-pip
+	#sudo apt-get -y install python3 python3-pip
-	#pip3 install -r requirements.txt
+	pip3 install -r requirements.txt
-	#pip3 install pyinstaller
+	pip3 install pyinstaller
 	#@echo "[+]Set CLOUDSCRAPER_PATH variable"
 	#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
 	@echo "[+]Pyinstaller make"
-	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+	pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+	--hidden-import "ImageProcessing.cnn" \
 	--add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \
 	--add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \
+	--add-data "`python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1`:face_recognition_models" \
 	--add-data "Img:Img" \
 	--add-data "config.ini:." \
File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 <!-- 说明:可使用文本编辑器打开本文件后自行编辑。
 keyword用于匹配标签/导演/系列/制作/发行的关键词,每个名字前后都需要用逗号隔开。当其中包含刮削得到的关键词时,可以输出对应语言的词。
 zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。当输出词为“删除”时表示遇到该关键词时在对应内容中删除该关键词-->
@@ -575,7 +575,7 @@ zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。
 <a zh_cn="一本道" zh_tw="一本道" jp="一本道" keyword=",一本道,"/>
 <a zh_cn="加勒比" zh_tw="加勒比" jp="加勒比" keyword=",加勒比,カリビアンコム,"/>
 <a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
-<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,サディスティックヴィレッジ,"/>
+<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,"/>
 <a zh_cn="PRESTIGE" zh_tw="PRESTIGE" jp="PRESTIGE" keyword=",PRESTIGE,プレステージ,"/>
 <a zh_cn="MOODYZ" zh_tw="MOODYZ" jp="MOODYZ" keyword=",MOODYZ,ムーディーズ,"/>
 <a zh_cn="ROCKET" zh_tw="ROCKET" jp="ROCKET" keyword=",ROCKET,"/>
@@ -600,28 +600,5 @@ zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。
 <a zh_cn="WANZ" zh_tw="WANZ" jp="WANZ" keyword=",WANZ,ワンズファクトリー,"/>
 <a zh_cn="BeFree" zh_tw="BeFree" jp="BeFree" keyword=",BeFree,"/>
 <a zh_cn="MAX-A" zh_tw="MAX-A" jp="MAX-A" keyword=",MAX-A,マックスエー,"/>
-<!-- 2021-11-8 Update -->
-<a zh_cn="Energy" zh_tw="Energy" jp="アイエナジー" keyword=",アイエナジー,"/>
-<a zh_cn="Idea Pocket" zh_tw="Idea Pocket" jp="アイデアポケット" keyword=",アイデアポケット,"/>
-<a zh_cn="AKNR" zh_tw="AKNR" jp="アキノリ" keyword=",アキノリ,"/>
-<a zh_cn="Attackers" zh_tw="Attackers" jp="アタッカーズ" keyword=",アタッカーズ,"/>
-<a zh_cn="Alice Japan" zh_tw="Alice Japan" jp="アリスJAPAN" keyword=",アリスJAPAN,"/>
-<a zh_cn="Aurora Project Annex" zh_tw="Aurora Project Annex" jp="オーロラプロジェクト・アネックス" keyword=",オーロラプロジェクト・アネックス,"/>
-<a zh_cn="Crystal 映像" zh_tw="Crystal 映像" jp="クリスタル映像" keyword=",クリスタル映像,"/>
-<a zh_cn="Glory Quest" zh_tw="Glory Quest" jp="グローリークエスト" keyword=",グローリークエスト,"/>
-<a zh_cn="DAS" zh_tw="DAS" jp="ダスッ!" keyword=",ダスッ!,"/>
-<a zh_cn="DEEPs" zh_tw="DEEPs" jp="ディープス" keyword=",ディープス,"/>
-<a zh_cn="Dogma" zh_tw="Dogma" jp="ドグマ" keyword=",ドグマ,"/>
-<a zh_cn="宇宙企画" zh_tw="宇宙企画" jp="メディアステーション" keyword=",メディアステーション,"/>
-<a zh_cn="WANZ FACTORY" zh_tw="WANZ FACTORY" jp="ワンズファクトリー" keyword=",ワンズファクトリー,"/>
-<a zh_cn="VR PRODUCE" zh_tw="VR PRODUCE" jp="VRプロダクツ" keyword=",VRプロダクツ,VRPRODUCE,"/>
-<a zh_cn="Real Works" zh_tw="Real Works" jp="レアルワークス" keyword=",レアルワークス,"/>
-<a zh_cn="MAX-A" zh_tw="MAX-A" jp="マックスエー" keyword=",マックスエー,"/>
-<a zh_cn="PETERS MAX" zh_tw="PETERS MAX" jp="ピーターズMAX" keyword=",ピーターズMAX,"/>
-<a zh_cn="NATURAL HIGH" zh_tw="NATURAL HIGH" jp="ナチュラルハイ" keyword=",ナチュラルハイ,"/>
-<a zh_cn="MAXING" zh_tw="MAXING" jp="マキシング" keyword=",マキシング,"/>
-<a zh_cn="Ms Video Group" zh_tw="Ms Video Group" jp="エムズビデオグループ" keyword=",エムズビデオグループ,"/>
-<a zh_cn="Minimum" zh_tw="Minimum" jp="ミニマム" keyword=",ミニマム,"/>
-<a zh_cn="WAAP Entertainment" zh_tw="WAAP Entertainment" jp="ワープエンタテインメント" keyword=",ワープエンタテインメント,"/>
-<a zh_cn="pacopacomama" zh_tw="pacopacomama" jp="パコパコママ" keyword=",pacopacomama,パコパコママ,"/>
 </info>
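Each keyword attribute is a comma-fenced list, so a scraped tag can be matched by wrapping it in commas and testing substring membership. A minimal sketch of how such an entry could be consumed, assuming lxml (element and attribute names are taken from the file above; the lookup function itself is hypothetical):

from lxml import etree

# Tiny excerpt of the mapping table, for illustration only.
xml = '''<info>
  <a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
</info>'''

def translate(tag: str, lang: str) -> str:
    root = etree.fromstring(xml.encode('utf-8'))
    for a in root.findall('a'):
        if f',{tag},' in a.get('keyword'):
            return a.get(lang)   # pick the zh_cn / zh_tw / jp variant
    return tag                   # no mapping: keep the original tag

print(translate('東熱', 'zh_cn'))  # -> 东京热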


@@ -18,7 +18,7 @@ from opencc import OpenCC
 import config
 from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
-from core import core_main, moveFailedFolder
+from core import core_main, core_main_no_net_op, moveFailedFolder
 
 def check_update(local_version):
@@ -40,7 +40,7 @@ def check_update(local_version):
     print("[*]======================================================")
 
-def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
+def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool, bool]:
     conf = config.getInstance()
     parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
     parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
@@ -49,6 +49,8 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
                         help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
     parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
     # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
+    parser.add_argument("-L", "--link-mode", default='', nargs='?',
+                        help="Create movie file link. 0:moving movie file, do not create link 1:soft link 2:try hard link first")
     default_logdir = str(Path.home() / '.mlogs')
     parser.add_argument("-o", "--log-dir", dest='logdir', default=default_logdir, nargs='?',
                         help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
@@ -60,12 +62,22 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
                         help="Override nfo_skip_days value in config.")
     parser.add_argument("-c", "--stop-counter", dest='cnt', default='', nargs='?',
                         help="Override stop_counter value in config.")
+    parser.add_argument("-R", "--rerun-delay", dest='delaytm', default='', nargs='?',
+                        help="Delay (eg. 1h10m30s or 60 (second)) time and rerun, until all movies proceed. Note: stop_counter value in config or -c must none zero.")
     parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
         os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
     parser.add_argument("-a", "--auto-exit", action="store_true",
                         help="Auto exit after program complete")
     parser.add_argument("-g", "--debug", action="store_true",
                         help="Turn on debug mode to generate diagnostic log for issue report.")
+    parser.add_argument("-N", "--no-network-operation", action="store_true",
+                        help="No network query, do not get metadata, for cover cropping purposes, only takes effect when main mode is 3.")
+    parser.add_argument("-w", "--website", dest='site', default='', nargs='?',
+                        help="Override [priority]website= in config.")
+    parser.add_argument("-D", "--download-images", dest='dnimg', action="store_true",
+                        help="Override [common]download_only_missing_images=0 force invoke image downloading.")
+    parser.add_argument("-C", "--config-override", dest='cfgcmd', default='', nargs='?',
+                        help="Common use config override. grammar: section:key=value[;[section:]key=value] eg. 'de:s=1' or 'debug_mode:switch=1' override [debug_mode]switch=1")
     parser.add_argument("-z", "--zero-operation", dest='zero_op', action="store_true",
                         help="""Only show job list of files and numbers, and **NO** actual operation
 is performed. It may help you correct wrong numbers before real job.""")
@@ -73,24 +85,40 @@ is performed. It may help you correct wrong numbers before real job.""")
     args = parser.parse_args()
 
-    def get_natural_number_or_none(value):
-        return int(value) if isinstance(value, str) and value.isnumeric() and int(value) >= 0 else None
+    def set_natural_number_or_none(sk, value):
+        if isinstance(value, str) and value.isnumeric() and int(value) >= 0:
+            conf.set_override(f'{sk}={value}')
 
-    def get_str_or_none(value):
-        return value if isinstance(value, str) and len(value) else None
+    def set_str_or_none(sk, value):
+        if isinstance(value, str) and len(value):
+            conf.set_override(f'{sk}={value}')
 
-    def get_bool_or_none(value):
-        return True if isinstance(value, bool) and value else None
+    def set_bool_or_none(sk, value):
+        if isinstance(value, bool) and value:
+            conf.set_override(f'{sk}=1')
 
-    config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
-    config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
-    config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
-    config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
-    config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
-    config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
-    config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
+    set_natural_number_or_none("common:main_mode", args.main_mode)
+    set_natural_number_or_none("common:link_mode", args.link_mode)
+    set_str_or_none("common:source_folder", args.path)
+    set_bool_or_none("common:auto_exit", args.auto_exit)
+    set_natural_number_or_none("common:nfo_skip_days", args.days)
+    set_natural_number_or_none("common:stop_counter", args.cnt)
+    set_bool_or_none("common:ignore_failed_list", args.ignore_failed_list)
+    set_str_or_none("common:rerun_delay", args.delaytm)
+    set_str_or_none("priority:website", args.site)
+    if isinstance(args.dnimg, bool) and args.dnimg:
+        conf.set_override("common:download_only_missing_images=0")
+    set_bool_or_none("debug_mode:switch", args.debug)
+    if isinstance(args.cfgcmd, str) and len(args.cfgcmd.strip()):
+        conf.set_override(args.cfgcmd.strip())
 
-    return args.file, args.number, args.logdir, args.regexstr, args.zero_op
+    no_net_op = False
+    if conf.main_mode() == 3:
+        no_net_op = args.no_network_operation
+        if no_net_op:
+            conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1")
+
+    return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op
 
 class OutLogger(object):
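The new -C flag and the set_* helpers all funnel into conf.set_override() with a 'section:key=value' string; several assignments can share one string, and a later item may omit the section to reuse the previous one. A hedged sketch of a parser for that grammar (the real set_override lives in config.py, which is not shown in this diff):

def parse_override(cmd: str):
    """Yield (section, key, value) triples from
    'section:key=value[;[section:]key=value]', e.g. 'debug_mode:switch=1;common:main_mode=3'."""
    section = None
    for item in cmd.split(';'):
        left, _, value = item.partition('=')
        if ':' in left:
            section, key = left.split(':', 1)
        else:
            key = left  # no section given: reuse the previous one
        yield section, key.strip(), value.strip()

print(list(parse_override('debug_mode:switch=1;common:main_mode=3')))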
@@ -113,9 +141,12 @@ class OutLogger(object):
             self.log.write(msg)
 
     def flush(self):
-        self.term.flush()
-        self.log.flush()
-        os.fsync(self.log.fileno())
+        if 'flush' in dir(self.term):
+            self.term.flush()
+        if 'flush' in dir(self.log):
+            self.log.flush()
+        if 'fileno' in dir(self.log):
+            os.fsync(self.log.fileno())
 
     def close(self):
         if self.term is not None:
@@ -244,39 +275,42 @@ def close_logfile(logdir: str):
     except:
         pass
     # 第三步,月合并到年
-    if today.month < 4:
-        return
-    mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)]
-    if not mons or not len(mons):
-        return
-    mons.sort()
-    deadline_year = f'mdc_{today.year - 1}13'
-    year_merge = [f for f in mons if f.stem < deadline_year]
-    if not year_merge or not len(year_merge):
-        return
-    toyear = len('12.txt')  # cut length mdc_2020|12.txt
-    for f in year_merge:
-        try:
-            year_file_name = str(f)[:-toyear] + '.txt'  # mdc_2020.txt
-            with open(year_file_name, 'a', encoding='utf-8') as y:
-                y.write(f.read_text(encoding='utf-8'))
-            f.unlink(missing_ok=True)
-        except:
-            pass
+    for i in range(1):
+        if today.month < 4:
+            break
+        mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)]
+        if not mons or not len(mons):
+            break
+        mons.sort()
+        deadline_year = f'mdc_{today.year - 1}13'
+        year_merge = [f for f in mons if f.stem < deadline_year]
+        if not year_merge or not len(year_merge):
+            break
+        toyear = len('12.txt')  # cut length mdc_2020|12.txt
+        for f in year_merge:
+            try:
+                year_file_name = str(f)[:-toyear] + '.txt'  # mdc_2020.txt
+                with open(year_file_name, 'a', encoding='utf-8') as y:
+                    y.write(f.read_text(encoding='utf-8'))
+                f.unlink(missing_ok=True)
+            except:
+                pass
     # 第四步,压缩年志 如果有压缩需求请自行手工压缩,或者使用外部脚本来定时完成。推荐nongnu的lzip,对于
     # 这种粒度的文本日志压缩比是目前最好的。lzip -9的运行参数下日志压缩比要高于xz -9而且内存占用更少
     # 多核利用率更高(plzip多线程版本)解压速度更快。压缩后的大小差不多是未压缩时的2.4%到3.7%左右,
     # 100MB的日志文件能缩小到3.7MB。
+    return filepath
 
 def signal_handler(*args):
     print('[!]Ctrl+C detected, Exit.')
-    sys.exit(9)
+    os._exit(9)
 
 def sigdebug_handler(*args):
-    config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
-    print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF'))
+    conf = config.getInstance()
+    conf.set_override(f"debug_mode:switch={int(not conf.debug())}")
+    print(f"[!]Debug {('oFF', 'On')[int(conf.debug())]}")
 
 # 新增失败文件列表跳过处理,及.nfo修改天数跳过处理,提示跳过视频总数,调试模式(-g)下详细被跳过文件,跳过小广告
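The month-to-year merge now sits inside `for i in range(1):` so its early exits can `break` out of that step instead of returning from the whole function, which must keep running to reach the new `return filepath`. The same one-pass-loop idiom in isolation:

def tail_always_runs(precondition: bool) -> str:
    # A single-iteration loop acts as a breakable block: 'break' skips the
    # rest of the step, while code after the loop still runs.
    for _ in range(1):
        if not precondition:
            break  # was 'return' before, which skipped the tail below
        print('merge step ran')
    return 'tail reached'

print(tail_always_runs(False))  # -> tail reached (merge skipped)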
@@ -285,7 +319,7 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     main_mode = conf.main_mode()
     debug = conf.debug()
     nfo_skip_days = conf.nfo_skip_days()
-    soft_link = conf.soft_link()
+    link_mode = conf.link_mode()
     file_type = conf.media_type().lower().split(",")
     trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
     cliRE = None
@@ -296,7 +330,7 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
         pass
     failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
     failed_set = set()
-    if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
+    if (main_mode == 3 or link_mode) and not conf.ignore_failed_list():
         try:
             flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
             failed_set = set(flist)
@@ -327,20 +361,24 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
             print('[!]Skip failed movie:', absf)
             continue
         is_sym = full_name.is_symlink()
-        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1):  # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标
-            continue  # file is symlink or hardlink(Linux/NTFS/Darwin)
+        if main_mode != 3 and (is_sym or (full_name.stat().st_nlink > 1 and not conf.scan_hardlink())):  # 短路布尔 符号链接不取stat(),因为符号链接可能指向不存在目标
+            continue  # 模式不等于3下跳过软连接和未配置硬链接刮削
         # 调试用0字节样本允许通过,去除小于120MB的广告'苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB)
         movie_size = 0 if is_sym else full_name.stat().st_size  # 同上 符号链接不取stat()及st_size,直接赋0跳过小视频检测
         if 0 < movie_size < 125829120:  # 1024*1024*120=125829120
             continue
         if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
             continue
-        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(
-                full_name.with_suffix('.nfo')) <= nfo_skip_days:
-            skip_nfo_days_cnt += 1
-            if debug:
-                print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
-            continue
+        if main_mode == 3:
+            nfo = full_name.with_suffix('.nfo')
+            if not nfo.is_file():
+                if debug:
+                    print(f"[!]Metadata {nfo.name} not found for '{absf}'")
+            elif nfo_skip_days > 0 and file_modification_days(nfo) <= nfo_skip_days:
+                skip_nfo_days_cnt += 1
+                if debug:
+                    print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
+                continue
         total.append(absf)
 
     if skip_failed_cnt:
@@ -348,13 +386,13 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     if skip_nfo_days_cnt:
         print(
             f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
-    if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
+    if nfo_skip_days <= 0 or not link_mode or main_mode == 3:
         return total
     # 软连接方式,已经成功削刮的也需要从成功目录中检查.nfo更新天数,跳过N天内更新过的
     skip_numbers = set()
     success_folder = Path(conf.success_folder()).resolve()
     for f in success_folder.glob(r'**/*'):
-        if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+        if not re.match(r'\.nfo$', f.suffix, re.IGNORECASE):
             continue
         if file_modification_days(f) > nfo_skip_days:
             continue
@@ -388,7 +426,7 @@ def create_failed_folder(failed_folder: str):
         os.makedirs(failed_folder)
     except:
         print(f"[-]Fatal error! Can not make folder '{failed_folder}'")
-        sys.exit(0)
+        os._exit(0)
def rm_empty_folder(path): def rm_empty_folder(path):
@@ -405,38 +443,44 @@ def rm_empty_folder(path):
pass pass
def create_data_and_move(file_path: str, zero_op, oCC): def create_data_and_move(movie_path: str, zero_op: bool, no_net_op: bool, oCC):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
debug = config.getInstance().debug() debug = config.getInstance().debug()
n_number = get_number(debug, os.path.basename(file_path)) n_number = get_number(debug, os.path.basename(movie_path))
file_path = os.path.abspath(file_path) movie_path = os.path.abspath(movie_path)
if debug is True: if debug is True:
print(f"[!] [{n_number}] As Number making data for '{file_path}'") print(f"[!] [{n_number}] As Number making data for '{movie_path}'")
if zero_op: if zero_op:
return return
if n_number: if n_number:
core_main(file_path, n_number, oCC) if no_net_op:
core_main_no_net_op(movie_path, n_number)
else:
core_main(movie_path, n_number, oCC)
else: else:
print("[-] number empty ERROR") print("[-] number empty ERROR")
moveFailedFolder(file_path) moveFailedFolder(movie_path)
print("[*]======================================================") print("[*]======================================================")
else: else:
try: try:
print(f"[!] [{n_number}] As Number making data for '{file_path}'") print(f"[!] [{n_number}] As Number making data for '{movie_path}'")
if zero_op: if zero_op:
return return
if n_number: if n_number:
core_main(file_path, n_number, oCC) if no_net_op:
core_main_no_net_op(movie_path, n_number)
else:
core_main(movie_path, n_number, oCC)
else: else:
raise ValueError("number empty") raise ValueError("number empty")
print("[*]======================================================") print("[*]======================================================")
except Exception as err: except Exception as err:
print(f"[-] [{file_path}] ERROR:") print(f"[-] [{movie_path}] ERROR:")
print('[-]', err) print('[-]', err)
try: try:
moveFailedFolder(file_path) moveFailedFolder(movie_path)
except Exception as err: except Exception as err:
print('[!]', err) print('[!]', err)
@@ -455,7 +499,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
         print("[-] [{}] ERROR:".format(file_path))
         print('[-]', err)
 
-        if conf.soft_link():
+        if conf.link_mode():
             print("[-]Link {} to failed folder".format(file_path))
             os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
         else:
@@ -466,23 +510,14 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
             print('[!]', err)
 
-def main():
-    version = '6.0.2'
-    urllib3.disable_warnings()  # Ignore http proxy warning
-
-    # Read config.ini first, in argparse_function() need conf.failed_folder()
-    conf = config.Config("config.ini")
-
-    # Parse command line args
-    single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
+def main(args: tuple) -> Path:
+    (single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op) = args
+    conf = config.getInstance()
 
     main_mode = conf.main_mode()
     folder_path = ""
     if main_mode not in (1, 2, 3):
         print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
-        sys.exit(4)
+        os._exit(4)
 
     signal.signal(signal.SIGINT, signal_handler)
     if sys.platform == 'win32':
@@ -508,8 +543,8 @@ def main():
     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
         print('[+]Enable debug')
-    if conf.soft_link():
-        print('[!]Enable soft link')
+    if conf.link_mode() in (1, 2):
+        print('[!]Enable {} link'.format(('soft', 'hard')[conf.link_mode() - 1]))
     if len(sys.argv) > 1:
         print('[!]CmdLine:', " ".join(sys.argv[1:]))
     print('[+]Main Working mode ## {}: {} ## {}{}{}'
@@ -521,7 +556,10 @@ def main():
         )
 
     if conf.update_check():
-        check_update(version)
+        try:
+            check_update(version)
+        except Exception as e:
+            print('[-]Update check failed!', e)
 
     create_failed_folder(conf.failed_folder())
@@ -539,15 +577,21 @@ def main():
         if file_modification_days(str(v)) >= conf.mapping_table_validity():
             print("[+]Mapping Table Out of date! Remove", str(v))
             os.remove(str(v))
-    res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists()))
-    for i, fp in enumerate(res, start=1):
-        if fp and len(fp):
-            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
-        else:
-            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
-            print("[-] --- AUTO EXIT AFTER 30s !!! --- ")
-            time.sleep(30)
-            os._exit(-1)
+    try:
+        res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists()))
+        for i, fp in enumerate(res, start=1):
+            if fp and len(fp):
+                print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+            else:
+                print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
+    except Exception as e:
+        print("[!] ==================== ERROR ====================")
+        print("[!] " + "Mapping Table Download FAILED".center(47))
+        print("[!] " + "无法连接github".center(47))
+        print("[!] " + "请过几小时再试试".center(47))
+        print("[-] " + "------ AUTO EXIT AFTER 30s !!! ------ ".center(47))
+        time.sleep(30)
+        os._exit(-1)
 
     # create OpenCC converter
     ccm = conf.cc_convert_mode()
@@ -587,7 +631,7 @@ def main():
             percentage = str(count / int(count_all) * 100)[:4] + '%'
             print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -',
                   time.strftime("%H:%M:%S")))
-            create_data_and_move(movie_path, zero_op, oCC)
+            create_data_and_move(movie_path, zero_op, no_net_op, oCC)
             if count >= stop_count:
                 print("[!]Stop counter triggered!")
                 break
@@ -605,14 +649,68 @@ def main():
     print("[+]All finished!!!")
-    close_logfile(logdir)
-    if not conf.auto_exit():
-        input("Press enter key exit, you can check the error message before you exit...")
-    sys.exit(0)
+    return close_logfile(logdir)
+
+
+def 分析日志文件(logfile):
+    try:
+        if not (isinstance(logfile, Path) and logfile.is_file()):
+            raise FileNotFoundError('log file not found')
+        logtxt = logfile.read_text(encoding='utf-8')
+        扫描电影数 = int(re.findall(r'\[\+]Find (.*) movies\.', logtxt)[0])
+        已处理 = int(re.findall(r'\[1/(.*?)] -', logtxt)[0])
+        完成数 = logtxt.count(r'[+]Wrote!')
+        return 扫描电影数, 已处理, 完成数
+    except:
+        return None, None, None
+
+
+def period(delta, pattern):
+    d = {'d': delta.days}
+    d['h'], rem = divmod(delta.seconds, 3600)
+    d['m'], d['s'] = divmod(rem, 60)
+    return pattern.format(**d)
+
 
 if __name__ == '__main__':
-    multiprocessing.freeze_support()
-    main()
+    version = '6.1.1'
+    urllib3.disable_warnings()  # Ignore http proxy warning
+    app_start = time.time()
+
+    # Read config.ini first, in argparse_function() need conf.failed_folder()
+    conf = config.Config("config.ini")
+
+    # Parse command line args
+    args = tuple(argparse_function(version))
+
+    再运行延迟 = conf.rerun_delay()
+    if 再运行延迟 > 0 and conf.stop_counter() > 0:
+        while True:
+            try:
+                logfile = main(args)
+                (扫描电影数, 已处理, 完成数) = 分析结果元组 = tuple(分析日志文件(logfile))
+                if all(isinstance(v, int) for v in 分析结果元组):
+                    剩余个数 = 扫描电影数 - 已处理
+                    总用时 = timedelta(seconds = time.time() - app_start)
+                    print(f'All movies:{扫描电影数} processed:{已处理} successes:{完成数} remain:{剩余个数}' +
+                          ' Elapsed time {}'.format(
+                              period(总用时, "{d} day {h}:{m:02}:{s:02}") if 总用时.days == 1
+                              else period(总用时, "{d} days {h}:{m:02}:{s:02}") if 总用时.days > 1
+                              else period(总用时, "{h}:{m:02}:{s:02}")))
+                    if 剩余个数 == 0:
+                        break
+                    下次运行 = datetime.now() + timedelta(seconds=再运行延迟)
+                    print(f'Next run time: {下次运行.strftime("%H:%M:%S")}, rerun_delay={再运行延迟}, press Ctrl+C stop run.')
+                    time.sleep(再运行延迟)
+                else:
+                    break
+            except:
+                break
+    else:
+        main(args)
+
+    if not conf.auto_exit():
+        if sys.platform == 'win32':
+            input("Press enter key exit, you can check the error message before you exit...")
+    sys.exit(0)
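period() formats a timedelta through an ordinary format pattern, which is what drives the singular/plural day selection in the rerun loop above. A quick check of its behaviour:

from datetime import timedelta

print(period(timedelta(seconds=3725), "{h}:{m:02}:{s:02}"))                  # 1:02:05
print(period(timedelta(days=2, seconds=61), "{d} days {h}:{m:02}:{s:02}"))   # 2 days 0:01:01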


@@ -25,7 +25,7 @@ CLI 版本
 # 文档
 * [官方教程WIKI](https://github.com/yoshiko2/Movie_Data_Capture/wiki)
-* [VergilGao's Docker部署](https://github.com/VergilGao/docker-avdc)
+* [VergilGao's Docker部署](https://github.com/VergilGao/docker-mdc)
 
 # 下载
 * [Releases](https://github.com/yoshiko2/Movie_Data_Capture/releases/latest)
@@ -36,43 +36,40 @@ CLI 版本
 # 申明
 当你查阅、下载了本项目源代码或二进制程序,即代表你接受了以下条款
 
-* 软件仅供技术交流,学术交流使用
+* 项目和项目成果仅供技术,学术交流和Python3性能测试使用
 * **请勿在墙内的社交平台上宣传此项目**
-* 软件作者编写出该软件旨在学习 Python ,提高编程水平
+* 项目贡献者编写该项目旨在学习Python3,提高编程水平
-* 软件不提供任何影片下载的线索
+* 项目不提供任何影片下载的线索
-* 用户在使用本软件前,请用户了解并遵守当地法律法规,如果本软件使用过程中存在违反当地法律法规的行为,请勿使用该软件
+* 用户在使用本项目和项目成果前,请用户了解并遵守当地法律法规,如果本项目及项目成果使用过程中存在违反当地法律法规的行为,请勿使用该项目及项目成果
-* 用户在使用本软件时,若用户在当地产生一切违法行为由用户承担
+* 用户在使用本项目和项目成果时,若用户在当地产生一切违法行为由用户承担
-* 严禁用户将本软件使用于商业和个人其他意图
+* 严禁用户将本项目和项目成果使用于商业和个人其他意图
 * 源代码和二进制程序请在下载后24小时内删除
-* 出售源码者的母亲会升天
-* 本项目发起者yoshiko2保留最终决定权和最终解释权
-* 若用户不同意上述条款任意一条,请勿使用本软件
+* 用户使用本项目及项目成果所造成的一切后果由用户自行承担,贡献者概不负责
+* 若用户不同意上述条款任意一条,请勿使用本项目和项目成果
 
 ---
 When you view and download the source code or binary program of this project, it means that you have accepted the following terms
-* This software is only for technical exchange and academic exchange
+* This project is only for technical exchange, academic exchange and Python3 performance test
 * **Please do not promote this project on popular social platforms**
-* The software author wrote this software to learn Python and improve programming
+* The project contributors wrote this project to learn Python and improve programming
-* This software does not provide any clues for video download
+* This project does not provide any clues for video download
-* Before using this software, please understand and abide by local laws and regulations. If there is any violation of local laws and regulations during the use of this software, * please do not use this software
+* Before using this project results, please understand and abide by local laws and regulations. If there is any violation of local laws and regulations during the use of this project results, * please do not use this project results
-* When the user uses this software, if the user has any illegal acts in the local area, the user shall bear
+* When the user uses this project results, if the user has any illegal acts in the local area, the user shall bear
-* It is strictly forbidden for users to use this software for commercial and personal intentions
+* It is strictly forbidden for users to use this project and project results for commercial and personal intentions
 * Please delete the source code and binary program within 24 hours after downloading
-* The mother of the source seller will die
-* The author of this software yoshiko2 reserves the right of final decision and final interpretation
-* If the user does not agree with any of the above terms, please do not use this software
+* All consequences caused by the user's use of this project and project results shall be borne by the user, and the contributors shall not be responsible
+* If the user does not agree with any of the above terms, please do not use this project results and project
 
 ---
 本プロジェクトのソースコード、バイナリファイルをダウンロード、または表示するしたうえで、あなたは本規約に同意したものと見なします。
-* このソフトウェアは、開発技術学習することのみに使用できます。
+* このプロジェクトは、開発技術学習、Python3性能テストすることのみに使用できます。
 * **ソーシャルメディアで本プロジェクトの宣伝をご遠慮ください**
-* 作者はPythonの勉強と技術力の向上のために、このソフトウェアを作成しました
+* 貢献者はPythonの勉強と技術力の向上のために、このソフトウェアを作成しました
-* 本ソフトウェアは、あらゆる動画ダウンロード機能一切提供しません
+* 本プロジェクトは、あらゆる動画ダウンロード機能一切提供しません
-* 本ソフトウェアを使用する前に、現地の法律規範をよく理解する必要があります。あなたは、適用される現地の法令を順守する責任を負います
+* 本プロジェクトとプロジェクトの成果を使用する前に、現地の法律規範をよく理解する必要があります。あなたは、適用される現地の法令を順守する責任を負います
-* 本ソフトウェアを使用した結果生じた損害や法的責任につきまして作者は一切責任を負いません
+* 本プロジェクトとプロジェクトの成果を使用した結果生じた損害や法的責任につきまして作者は一切責任を負いません
-* 本ソフトウェアを商用、業務、その他の営利目的のために使用することは一切禁止します。
+* 本プロジェクトとプロジェクトの成果を商用、業務、その他の営利目的のために使用することは一切禁止します。
 * 本プロジェクトのソースコード、バイナリファイルをダウンロードした場合、24時間以内に削除してください
-* 元売り手の母親が天に召される
-* 最終解釈権は作者yoshiko2に属します
+* ユーザーによるこのプロジェクトの使用およびプロジェクトの結果によって引き起こされるすべての結果は、ユーザーが負担するものとし、寄稿者は責任を負わないものとします。
 * 本規約およびすべての適用法、規約および規則を遵守する場合にのみ本ソフトウェアを使用することができます


@@ -24,6 +24,7 @@ from . import carib
 from . import fc2club
 from . import mv91
 from . import madou
+from . import gcolle
 
 def get_data_state(data: dict) -> bool:  # 元数据获取失败检测
@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
         "carib": carib.main,
         "fc2club": fc2club.main,
         "mv91": mv91.main,
-        "madou": madou.main
+        "madou": madou.main,
+        "gcolle": gcolle.main,
     }
 
     conf = config.getInstance()
@@ -91,6 +93,8 @@ def get_data_from_json(file_number, oCC):
             sources.insert(0, sources.pop(sources.index("fc2")))
         if "fc2club" in sources:
             sources.insert(0, sources.pop(sources.index("fc2club")))
+    elif "gcolle" in sources and (re.search("\d{6}", file_number)):
+        sources.insert(0, sources.pop(sources.index("gcolle")))
     elif "dlsite" in sources and (
         "rj" in lo_file_number or "vj" in lo_file_number
     ):
@@ -100,6 +104,12 @@ def get_data_from_json(file_number, oCC):
         sources.insert(0, sources.pop(sources.index("javdb")))
         if "xcity" in sources:
             sources.insert(0, sources.pop(sources.index("xcity")))
+        if "madou" in sources:
+            sources.insert(0, sources.pop(sources.index("madou")))
+    elif "madou" in sources and (
+        re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
+    ):
+        sources.insert(0, sources.pop(sources.index("madou")))
 
     # check sources in func_mapping
     todel = []
@@ -124,7 +134,10 @@ def get_data_from_json(file_number, oCC):
         for source in sources:
             if conf.debug() == True:
                 print('[+]select', source)
-            json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
+            try:
+                json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
+            except:
+                json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
             # if any service return a valid return, break
             if get_data_state(json_data):
                 print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -136,7 +149,10 @@ def get_data_from_json(file_number, oCC):
             try:
                 if conf.debug() == True:
                     print('[+]select', source)
-                json_data = json.loads(func_mapping[source](file_number))
+                try:
+                    json_data = json.loads(func_mapping[source](file_number))
+                except:
+                    json_data = func_mapping[source](file_number)
                 # if any service return a valid return, break
                 if get_data_state(json_data):
                     print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -242,8 +258,8 @@ def get_data_from_json(file_number, oCC):
             if json_data[translate_value] == "":
                 continue
             if translate_value == "title":
-                title_dict = json.load(
-                    open(str(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json'), 'r', encoding="utf-8"))
+                title_dict = json.loads(
+                    (Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
             try:
                 json_data[translate_value] = title_dict[number]
                 continue
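Both call sites now tolerate scrapers that return a ready dict instead of a JSON string: json.loads raises TypeError on a dict, and the bare except keeps the value as-is. The same pattern in isolation, with the catch tightened to named exceptions:

import json

def as_dict(result):
    # Scrapers historically returned a JSON string; newer ones may return
    # a dict directly. Accept both, mirroring the try/except above.
    try:
        return json.loads(result)
    except (TypeError, ValueError):
        return result

print(as_dict('{"title": "x"}'))  # parsed from JSON text
print(as_dict({'title': 'x'}))    # passed through unchanged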


@@ -5,6 +5,7 @@ from lxml import etree
 import json
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
+from WebCrawler.crawler import *
 
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -17,95 +18,64 @@ def getActorPhoto(html):
         p2 = {t: l}
         d.update(p2)
     return d
 
-def getTitle(html):
-    try:
-        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  #[0]
-        return result.replace('/', '')
-    except:
-        return ''
-
 def getActor(html):
     a = html.xpath('//a[@class="avatar-box"]')
     d = []
     for i in a:
         d.append(i.find('span').text)
     return d
 
-def getStudio(html):
-    result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
-    return result1
-
-def getRuntime(html):
-    result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
-    return result1
-
-def getLabel(html):
-    result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
-    return result1
-
-def getNum(html):
-    result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
-    return result1
-
-def getYear(release):
-    try:
-        result = str(re.search('\d{4}',release).group())
-        return result
-    except:
-        return release
-
-def getRelease(html):
-    result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
-    return result1
-
-def getCover(html):
-    result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
-    return result
-
 def getCover_small(html):
     result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
     return result
 
 def getTag(html):
     x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
     return [i.strip() for i in x[2:]] if len(x) > 2 else []
 
-def getSeries(html):
-    try:
-        result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
-        return result1
-    except:
-        return ''
-
 def main(number):
     html = get_html('https://tellme.pw/avsox')
-    site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
+    site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
     a = get_html(site + '/cn/search/' + number)
-    html = etree.fromstring(a, etree.HTMLParser())
-    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+    html = Crawler(a)
+    result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('-', '_'))
-        html = etree.fromstring(a, etree.HTMLParser())
-        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+        html = Crawler(a)
+        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
        if result1 == '' or result1 == 'null' or result1 == 'None':
            a = get_html(site + '/cn/search/' + number.replace('_', ''))
-            html = etree.fromstring(a, etree.HTMLParser())
-            result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
+            html = Crawler(a)
+            result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
     detail = get_html("https:" + result1)
     lx = etree.fromstring(detail, etree.HTMLParser())
+    avsox_crawler2 = Crawler(a)
+    avsox_crawler = Crawler(detail)
     try:
-        new_number = getNum(lx)
+        new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
        if new_number.upper() != number.upper():
            raise ValueError('number not found')
-        title = getTitle(lx).strip(new_number)
+        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
        dic = {
            'actor': getActor(lx),
            'title': title,
-            'studio': getStudio(lx),
+            'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
            'outline': getStoryline(number, title),
-            'runtime': getRuntime(lx),
+            'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
            'director': '',  #
-            'release': getRelease(lx),
+            'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
            'number': new_number,
-            'cover': getCover(lx),
+            'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
-            'cover_small': getCover_small(html),
+            #'cover_small' : getCover_small(html),
+            'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
            'imagecut': 3,
            'tag': getTag(lx),
-            'label': getLabel(lx),
+            'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
-            'year': getYear(getRelease(lx)),
+            'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
            'actor_photo': getActorPhoto(lx),
            'website': "https:" + result1,
            'source': 'avsox.py',
-            'series': getSeries(lx),
+            'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
        }
    except Exception as e:
        if config.getInstance().debug():


@@ -40,6 +40,7 @@ def main(number: str) -> json:
         'website': f'{G_SITE}/moviepages/{number}/index.html',
         'source': 'carib.py',
         'series': get_series(lx),
+        '无码': True
     }
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js
@@ -59,7 +60,7 @@ def get_year(lx: html.HtmlElement) -> str:
 
 def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
     o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
-    g = getStoryline(number, title)
+    g = getStoryline(number, title, 无码=True)
     if len(g):
         return g
     return o

WebCrawler/crawler.py (new file, 28 lines)

@@ -0,0 +1,28 @@
+from lxml import etree
+
+
+class Crawler:
+    def __init__(self, htmlcode):
+        self.html = etree.HTML(htmlcode)
+
+    def getString(self, _xpath):
+        if _xpath == "":
+            return ""
+        result = self.html.xpath(_xpath)
+        try:
+            return result[0]
+        except:
+            return ""
+
+    def getStrings(self, _xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return result
+        except:
+            return ""
+
+    def getOutline(self, _xpath):
+        result = self.html.xpath(_xpath)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
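The class wraps the repeated etree.HTML + xpath + first-result-or-empty dance that avsox.py used to spell out by hand. A short usage sketch with inline HTML instead of a fetched page:

page = Crawler('<html><body><h3>Some Title</h3><p>a</p><p>b</p></body></html>')
print(page.getString('//h3/text()'))   # 'Some Title' (first hit, or '' if none)
print(page.getStrings('//p/text()'))   # ['a', 'b']
print(page.getOutline('//p/text()'))   # 'a\nb'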


@@ -1,15 +1,14 @@
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 import sys
 sys.path.append('../')
 from ADC_function import *
 # import sys
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
-#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
+#print(get_html('https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html'))
-#title //*[@id="work_name"]/a/text()
+#title /html/head/title/text()
 #studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
 #release //th[contains(text(),"販売日")]/../td/a/text()
 #story //th[contains(text(),"シナリオ")]/../td/a/text()
@@ -18,14 +17,14 @@ from ADC_function import *
#jianjie //*[@id="main_inner"]/div[3]/text() #jianjie //*[@id="main_inner"]/div[3]/text()
#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src #photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src
#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html #https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html
def getTitle(a): def getTitle(html):
html = etree.fromstring(a, etree.HTMLParser()) result = str(html.xpath('/html/head/title/text()')[0])
result = html.xpath('//*[@id="work_name"]/a/text()')[0] result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
return result return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()') result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except: except:
@@ -38,8 +37,7 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
p={i:''} p={i:''}
d.update(p) d.update(p)
return d return d
def getStudio(a): def getStudio(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
try: try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -53,8 +51,7 @@ def getRuntime(a):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi') return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a): def getLabel(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
try: try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -69,12 +66,10 @@ def getYear(getRelease):
return result return result
except: except:
return getRelease return getRelease
def getRelease(a): def getRelease(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0] result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
return result1.replace('年','-').replace('月','-').replace('日','') return result1.replace('年','-').replace('月','-').replace('日','')
def getTag(a): def getTag(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()') result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result return result
@@ -96,26 +91,22 @@ def getCover_small(a, index=0):
if not 'https' in result: if not 'https' in result:
result = 'https:' + result result = 'https:' + result
return result return result
def getCover(htmlcode): def getCover(html):
html = etree.fromstring(htmlcode, etree.HTMLParser()) result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0] return result.replace('.webp', '.jpg')
return result def getDirector(html):
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0] result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except: except:
result = '' result = ''
return result return result
def getOutline(htmlcode): def getOutline(html):
html = etree.fromstring(htmlcode, etree.HTMLParser())
total = [] total = []
result = html.xpath('//*[@id="main_inner"]/div[3]/text()') result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result: for i in result:
total.append(i.strip('\r\n')) total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a): def getSeries(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
try: try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0] result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -127,28 +118,28 @@ def getSeries(a):
def main(number): def main(number):
try: try:
number = number.upper() number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN',
cookies={'locale': 'zh-cn'}) cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = { dic = {
'actor': getActor(htmlcode), 'actor': getActor(html),
'title': getTitle(htmlcode), 'title': getTitle(html),
'studio': getStudio(htmlcode), 'studio': getStudio(html),
'outline': getOutline(htmlcode), 'outline': getOutline(html),
'runtime': '', 'runtime': '',
'director': getDirector(htmlcode), 'director': getDirector(html),
'release': getRelease(htmlcode), 'release': getRelease(html),
'number': number, 'number': number,
'cover': 'https:' + getCover(htmlcode), 'cover': 'https:' + getCover(html),
'cover_small': '', 'cover_small': '',
'imagecut': 0, 'imagecut': 0,
'tag': getTag(htmlcode), 'tag': getTag(html),
'label': getLabel(htmlcode), 'label': getLabel(html),
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '', 'actor_photo': '',
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html', 'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py', 'source': 'dlsite.py',
'series': getSeries(htmlcode), 'series': getSeries(html),
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js return js
@@ -166,4 +157,6 @@ def main(number):
# main('DV-1562') # main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__": if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('VJ013178')) print(main('VJ013178'))
print(main('RJ329607'))
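
The dlsite rewrite parses the page once in main() and hands the lxml element to every getter instead of re-running etree.fromstring() per field; the pattern in miniature (HTML is illustrative):

    from lxml import etree

    htmlcode = '<html><head><title>Work | DLsite</title></head><body></body></html>'
    html = etree.fromstring(htmlcode, etree.HTMLParser())  # parse once

    def getTitle(html):
        result = str(html.xpath('/html/head/title/text()')[0])
        return result[:result.rfind(' | DLsite')]          # drop the site suffix

    print(getTitle(html))  # Work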

WebCrawler/fanza.py

@@ -9,130 +9,33 @@ from urllib.parse import urlencode
from lxml import etree from lxml import etree
from ADC_function import * from ADC_function import *
from WebCrawler.crawler import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getTitle(text): def getFanzaStrings(self, string):
html = etree.fromstring(text, etree.HTMLParser()) result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
result = html.xpath('//*[starts-with(@id, "title")]/text()')[0] if len(result1) > 0:
return result return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
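
The two getFanza* helpers cover both markups Fanza uses for a field: the value may sit inside an <a> in the cell or as bare cell text. A sketch of the fallback with illustrative table HTML, assuming the fanzaCrawler class defined above:

    html = ('<table><tr><td>メーカー:</td><td><a>StudioA</a></td></tr>'
            '<tr><td>レーベル:</td><td>LabelB</td></tr></table>')
    c = fanzaCrawler(html)
    print(c.getFanzaString('メーカー'))   # 'StudioA' -- value wrapped in <a>
    print(c.getFanzaString('レーベル'))   # 'LabelB'  -- bare cell text
    print(c.getFanzaStrings('メーカー'))  # ['StudioA'] -- list form, as used for tags
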
def getActor(text): def getRelease(fanza_Crawler):
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text() result = fanza_Crawler.getFanzaString('発売日:')
html = etree.fromstring(text, etree.HTMLParser()) if result == '----':
result = ( result = fanza_Crawler.getFanzaString('配信開始日:')
str( return result.replace("/", "-").strip('\\n')
html.xpath(
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
)
)
.strip(" ['']")
.replace("', '", ",")
)
return result
def getStudio(text): def getCover(html, number):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
)[0]
return result
def getRuntime(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
return re.search(r"\d+", str(result)).group()
def getLabel(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
)[0]
return result
def getNum(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
)[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
def getRelease(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
result = "----"
if result == "----":
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
pass
return result.replace("/", "-")
def getTag(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
)
return result
except:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
)
return result
def getCover(text, number):
html = etree.fromstring(text, etree.HTMLParser())
cover_number = number cover_number = number
try: try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0] result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
@@ -151,29 +54,11 @@ def getCover(text, number):
return result return result
def getDirector(text): def getOutline(html):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath( result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
)[0]
return result
def getOutline(text):
html = etree.fromstring(text, etree.HTMLParser())
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
"\n", ""
)
if result == "": if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace( result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
"\n", ""
)
except: except:
# (TODO) handle more edge case # (TODO) handle more edge case
# print(html) # print(html)
@@ -181,23 +66,8 @@ def getOutline(text):
return result return result
def getSeries(text):
try:
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
)[0]
return result
except:
return ""
def getExtrafanart(htmlcode): # 获取剧照 def getExtrafanart(htmlcode): # 获取剧照
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>') html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div></div>')
html = html_pather.search(htmlcode) html = html_pather.search(htmlcode)
if html: if html:
html = html.group() html = html.group()
@@ -232,6 +102,7 @@ def main(number):
"https://www.dmm.co.jp/rental/-/detail/=/cid=", "https://www.dmm.co.jp/rental/-/detail/=/cid=",
] ]
chosen_url = "" chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls: for url in fanza_urls:
chosen_url = url + fanza_search_number chosen_url = url + fanza_search_number
@@ -240,6 +111,7 @@ def main(number):
urlencode({"rurl": chosen_url}) urlencode({"rurl": chosen_url})
) )
) )
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode: if "404 Not Found" not in htmlcode:
break break
if "404 Not Found" in htmlcode: if "404 Not Found" in htmlcode:
@@ -249,28 +121,34 @@ def main(number):
# for example, the url will be cid=test012 # for example, the url will be cid=test012
# but the hinban on the page is test00012 # but the hinban on the page is test00012
# so get the hinban first, and then pass it to following functions # so get the hinban first, and then pass it to following functions
fanza_hinban = getNum(htmlcode) fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
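
The normalization above maps the user-supplied number onto the hinban printed on the page, so the friendlier spelling is kept when they agree; e.g. PRED-251 resolves to cid pred00251 (values below are illustrative):

    import re

    number_lo = 'pred-251'
    fanza_hinban = 'pred00251'
    same = (re.sub('-|_', '', number_lo) == fanza_hinban or
            number_lo.replace('-', '00') == fanza_hinban or      # PRED-251 -> pred00251
            number_lo.replace('-', '') + 'so' == fanza_hinban)   # trailing 'so' variants
    print(same)  # True, matched by the '-' -> '00' rule
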
data = { data = {
"title": getTitle(htmlcode).strip(), "title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": getStudio(htmlcode), "studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(htmlcode), "outline": getOutline(html),
"runtime": getRuntime(htmlcode), "runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": getDirector(htmlcode) if "anime" not in chosen_url else "", "director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
"actor": getActor(htmlcode) if "anime" not in chosen_url else "", "actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
"release": getRelease(htmlcode), "release": getRelease(fanza_Crawler),
"number": fanza_hinban, "number": out_num,
"cover": getCover(htmlcode, fanza_hinban), "cover": getCover(html, fanza_hinban),
"imagecut": 1, "imagecut": 1,
"tag": getTag(htmlcode), "tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode), "extrafanart": getExtrafanart(htmlcode),
"label": getLabel(htmlcode), "label": fanza_Crawler.getFanzaString('レーベル'),
"year": getYear( "year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
getRelease(htmlcode)
), # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "", "actor_photo": "",
"website": chosen_url, "website": chosen_url,
"source": "fanza.py", "source": "fanza.py",
"series": getSeries(htmlcode), "series": fanza_Crawler.getFanzaString('シリーズ:'),
} }
except: except:
data = { data = {
@@ -314,4 +192,6 @@ def main_htmlcode(number):
if __name__ == "__main__": if __name__ == "__main__":
# print(main("DV-1562")) # print(main("DV-1562"))
# print(main("96fad1217")) # print(main("96fad1217"))
print(main("h_173ghmt68")) print(main("pred00251"))
print(main("MIAA-391"))
print(main("OBA-326"))

WebCrawler/fc2.py

@@ -4,58 +4,11 @@ import re
from lxml import etree#need install from lxml import etree#need install
import json import json
import ADC_function import ADC_function
from WebCrawler.crawler import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle_fc2com(htmlcode): #获取厂商
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
return result
except:
return ''
def getStudio_fc2com(htmlcode): #获取厂商
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode): #获取番号
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease_fc2com(htmlcode2): #
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
return result
def getCover_fc2com(htmlcode2): #获取厂商 #
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
return 'http:' + result
# def getOutline_fc2com(htmlcode2): #获取番号 #
# xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
# path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
# html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
# print('https://adult.contents.fc2.com'+path)
# print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
# result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
# return result
def getTag_fc2com(lx):
result = lx.xpath("//a[@class='tag tagTag']/text()")
return result
def getYear_fc2com(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def getExtrafanart(htmlcode): # 获取剧照 def getExtrafanart(htmlcode): # 获取剧照
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>') html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode) html = html_pather.search(htmlcode)
@@ -79,27 +32,30 @@ def getTrailer(htmlcode, number):
except: except:
return '' return ''
else: else:
video_url = '' return ''
def main(number): def main(number):
try: try:
number = number.replace('FC2-', '').replace('fc2-', '') number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/') htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
actor = getActor_fc2com(htmlcode2) fc2_crawler = Crawler(htmlcode2)
if not actor: actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人' actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser()) lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']") cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover) cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = { dic = {
'title': lx.xpath('/html/head/title/text()')[0], 'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': getStudio_fc2com(htmlcode2), 'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 'year': re.findall('\d{4}',release)[0],
'outline': '', # getOutline_fc2com(htmlcode2), 'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]), 'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': getStudio_fc2com(htmlcode2), 'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor, 'actor': actor,
'release': getRelease_fc2com(htmlcode2), 'release': release,
'number': 'FC2-' + number, 'number': 'FC2-' + number,
'label': '', 'label': '',
'cover': cover, 'cover': cover,
@@ -107,7 +63,7 @@ def main(number):
'extrafanart': getExtrafanart(htmlcode2), 'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number), "trailer": getTrailer(htmlcode2, number),
'imagecut': 0, 'imagecut': 0,
'tag': getTag_fc2com(lx), 'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '', 'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/', 'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/', 'source': 'https://adult.contents.fc2.com/article/' + number + '/',
@@ -121,6 +77,4 @@ def main(number):
return js return js
if __name__ == '__main__': if __name__ == '__main__':
print(main('FC2-1787685')) print(main('FC2-2182382'))
print(main('FC2-2086710'))

WebCrawler/gcolle.py Normal file (88 lines)

@@ -0,0 +1,88 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
from lxml import etree
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cooikes)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))

WebCrawler/jav321.py

@@ -56,9 +56,9 @@ def parse_info(soup: BeautifulSoup) -> dict:
"label": get_label(data_dic), "label": get_label(data_dic),
"studio": get_studio(data_dic), "studio": get_studio(data_dic),
"tag": get_tag(data_dic), "tag": get_tag(data_dic),
"number": get_number(data_dic), "number": get_number(data_dic).upper(),
"release": get_release(data_dic), "release": get_release(data_dic),
"runtime": get_runtime(data_dic), "runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic), "series": get_series(data_dic),
} }
else: else:

WebCrawler/javbus.py

@@ -60,10 +60,10 @@ def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string) result = re.sub('/.*?.jpg','',string)
return result return result
def getOutline(number, title): #获取剧情介绍 多进程并发查询 def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # 从airav.py过来的调用不计算outline直接返回避免重复抓取数据拖慢处理速度 return '' # 从airav.py过来的调用不计算outline直接返回避免重复抓取数据拖慢处理速度
return getStoryline(number,title) return getStoryline(number,title, 无码=uncensored)
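
The inspect.stack() guard walks the live call stack and returns early when any frame comes from airav.py, since that caller fetches its own storyline; the detection pattern in isolation:

    import inspect
    import os

    def called_from(filename: str) -> bool:
        # True if any frame in the current call stack originates from `filename`
        return any(os.path.basename(f.filename) == filename for f in inspect.stack())

    print(called_from('airav.py'))  # False when run standalone
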
def getSeriseJa(html): def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else '' return str(x[0]) if len(x) else ''
@@ -83,9 +83,13 @@ def getExtrafanart(htmlcode): # 获取剧照
if extrafanart_imgs: if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs] return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return '' return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
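
getUncensored() only checks which navbar tab is marked active, so detection costs no extra request. For western releases, main_uncensored() now swaps the dots for dashes before building the javbus.red URL:

    number = 'BrazzersExxtra.21.02.01'
    w_number = number.replace('.', '-')
    print('https://www.javbus.red/' + w_number)
    # https://www.javbus.red/BrazzersExxtra-21-02-01
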
def main_uncensored(number): def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number) w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode: if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found') raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser()) lx = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -94,7 +98,7 @@ def main_uncensored(number):
'title': title, 'title': title,
'studio': getStudioJa(lx), 'studio': getStudioJa(lx),
'year': getYear(lx), 'year': getYear(lx),
'outline': getOutline(number, title), 'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx), 'runtime': getRuntime(lx),
'director': getDirectorJa(lx), 'director': getDirectorJa(lx),
'actor': getActor(lx), 'actor': getActor(lx),
@@ -106,9 +110,10 @@ def main_uncensored(number):
'label': getSeriseJa(lx), 'label': getSeriseJa(lx),
'imagecut': 0, 'imagecut': 0,
# 'actor_photo': '', # 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number, 'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py', 'source': 'javbus.py',
'series': getSeriseJa(lx), 'series': getSeriseJa(lx),
'无码': True
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js return js
@@ -136,7 +141,7 @@ def main(number):
'title': title, 'title': title,
'studio': getStudio(lx), 'studio': getStudio(lx),
'year': getYear(lx), 'year': getYear(lx),
'outline': getOutline(number, title), 'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx), 'runtime': getRuntime(lx),
'director': getDirector(lx), 'director': getDirector(lx),
'actor': getActor(lx), 'actor': getActor(lx),
@@ -151,6 +156,7 @@ def main(number):
'website': 'https://www.javbus.com/' + number, 'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py', 'source': 'javbus.py',
'series': getSerise(lx), 'series': getSerise(lx),
'无码': getUncensored(lx)
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js return js
@@ -168,13 +174,14 @@ def main(number):
return js return js
if __name__ == "__main__" : if __name__ == "__main__" :
config.G_conf_override['debug_mode:switch'] = True config.getInstance().set_override("debug_mode:switch=1")
print(main('ABP-888')) # print(main('ABP-888'))
print(main('ABP-960')) # print(main('ABP-960'))
print(main('ADV-R0624')) # 404 # print(main('ADV-R0624')) # 404
print(main('MMNT-010')) # print(main('MMNT-010'))
print(main('ipx-292')) # print(main('ipx-292'))
print(main('CEMD-011')) # print(main('CEMD-011'))
print(main('CJOD-278')) # print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001')) print(main('100221_001'))
print(main('AVSW-061')) print(main('AVSW-061'))

WebCrawler/javdb.py

@@ -166,12 +166,23 @@ def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title): #获取剧情介绍 多进程并发查询 def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询
return getStoryline(number,title) return getStoryline(number, title, 无码=uncensored)
def getSeries(html): def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
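
getUserRating() scrapes the text next to the star widget; a worked example of the regex on a typical javdb rating string (sample text is illustrative):

    import re

    text = '4.5分, 由123人評價'
    v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', text)
    print(float(v[0][0]), int(v[0][1]))  # 4.5 123
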
def main(number): def main(number):
# javdb更新后同一时间只能登录一个数字站最新登录站会踢出旧的登录因此按找到的第一个javdb*.json文件选择站点 # javdb更新后同一时间只能登录一个数字站最新登录站会踢出旧的登录因此按找到的第一个javdb*.json文件选择站点
@@ -276,7 +287,7 @@ def main(number):
'actor': getActor(lx), 'actor': getActor(lx),
'title': title, 'title': title,
'studio': getStudio(detail_page, lx), 'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title), 'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx), 'runtime': getRuntime(lx),
'director': getDirector(lx), 'director': getDirector(lx),
'release': getRelease(detail_page), 'release': getRelease(detail_page),
@@ -293,8 +304,12 @@ def main(number):
'website': urljoin('https://javdb.com', correct_url), 'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py', 'source': 'javdb.py',
'series': getSeries(lx), 'series': getSeries(lx),
'无码': getUncensored(lx)
} }
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人') dic['actor'].append('素人')
if not dic['series']: if not dic['series']:
@@ -313,18 +328,19 @@ def main(number):
# main('DV-1562') # main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__": if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30')) # print(main('blacked.20.05.30'))
# print(main('AGAV-042')) # print(main('AGAV-042'))
# print(main('BANK-022')) # print(main('BANK-022'))
# print(main('070116-197')) print(main('070116-197'))
# print(main('093021_539')) # 没有剧照 片商pacopacomama # print(main('093021_539')) # 没有剧照 片商pacopacomama
#print(main('FC2-2278260')) #print(main('FC2-2278260'))
# print(main('FC2-735670')) # print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found # print(main('FC2-1174949')) # not found
#print(main('MVSD-439')) #print(main('MVSD-439'))
# print(main('EHM0001')) # not found # print(main('EHM0001')) # not found
print(main('FC2-2314275')) #print(main('FC2-2314275'))
# print(main('EBOD-646')) # print(main('EBOD-646'))
# print(main('LOVE-262')) # print(main('LOVE-262'))
#print(main('ABP-890')) print(main('ABP-890'))
print(main('blacked.14.12.08'))

WebCrawler/madou.py

@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup # need install from bs4 import BeautifulSoup # need install
from lxml import etree # need install from lxml import etree # need install
from pyquery import PyQuery as pq # need install from pyquery import PyQuery as pq # need install
@@ -5,24 +7,22 @@ from ADC_function import *
import json import json
import re import re
from lib2to3.pgen2 import parse from lib2to3.pgen2 import parse
import sys
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
sys.path.append('../')
def getActorPhoto(html): def getActorPhoto(html):
return '' return ''
def getTitle(html, number): # 获取标题 def getTitle(html): # 获取标题
title = str(html.xpath('//h1[@class="article-title"]/text()')[0]) # <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
try: # <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
result = str(re.split(r'[/||-]', title)[1]) # <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
return result.strip() # <title>TM0002-我的痴女女友-麻豆社</title>
except: browser_title = str(html.xpath("/html/head/title/text()")[0])
return title.replace(number.upper(), '').strip() title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
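
A worked example of the new title regex against the sample <title> values quoted in the comments above: the leading character class eats the ASCII number prefix, and the capture group keeps everything up to the trailing -麻豆社:

    import re

    for browser_title in ('MD0140-2 / 家有性事EP2 爱在身边-麻豆社',
                          'MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社'):
        print(str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip())
    # 家有性事EP2 爱在身边
    # 贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕
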
def getStudio(html): # 获取厂商 已修改 def getStudio(html): # 获取厂商 已修改
try: try:
@@ -61,7 +61,6 @@ def getNum(url, number): # 获取番号
filename = unquote(urlparse(url).path) filename = unquote(urlparse(url).path)
# 裁剪文件名 # 裁剪文件名
result = filename[1:-5].upper().strip() result = filename[1:-5].upper().strip()
print(result)
# 移除中文 # 移除中文
if result.upper() != number.upper(): if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0] result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
@@ -83,13 +82,15 @@ def getSerise(html): # 获取系列 已修改
return '' return ''
def getTag(html): # 获取标签 def getTag(html, studio): # 获取标签
return html.xpath('//div[@class="article-tags"]/a/text()') x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html): # 获取剧照 def getExtrafanart(html): # 获取剧照
return '' return ''
def cutTags(tags): def cutTags(tags):
actors = [] actors = []
tags = [] tags = []
@@ -109,13 +110,15 @@ def main(number):
html = etree.fromstring(htmlcode, etree.HTMLParser()) html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html) url = getUrl(html)
tags = getTag(html) studio = getStudio(html)
actor,tags = cutTags(tags); tags = getTag(html, studio)
#actor,tags = cutTags(tags) # 演员在tags中的位置不固定放弃尝试获取
actor = ''
dic = { dic = {
# 标题 # 标题
'title': getTitle(html, number), 'title': getTitle(html),
# 制作商 # 制作商
'studio': getStudio(html), 'studio': studio,
# 年份 # 年份
'year': getYear(html), 'year': getYear(html),
# 简介 # 简介
@@ -143,7 +146,8 @@ def main(number):
'website': url, 'website': url,
'source': 'madou.py', 'source': 'madou.py',
# 使用 # 使用
'series': getSerise(html) 'series': getSerise(html),
'无码': True
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8') indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -161,4 +165,11 @@ def main(number):
if __name__ == '__main__': if __name__ == '__main__':
print(main('MD0094')) config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))

WebCrawler/mgstage.py

@@ -5,95 +5,28 @@ from lxml import etree
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
from WebCrawler.crawler import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a): class MgsCrawler(Crawler):
try: def getMgsString(self, _xpath):
html = etree.fromstring(a, etree.HTMLParser()) html = self.html
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']") result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result.replace('/', ',') result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
except: return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}',getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace('/','-')
def getTag(a): def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip( result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
'\\n') result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result return result
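
getMgsString() folds the old per-field helpers into one call: it evaluates the given td/a/text() path plus, via _xpath.replace('td/a/', 'td/'), the bare td/text() fallback, and concatenates whichever matched. In isolation (illustrative HTML, assuming the MgsCrawler class above):

    html2 = ('<table><tr><th>メーカー:</th><td><a>StudioA</a></td></tr>'
             '<tr><th>品番:</th><td>ABC-123</td></tr></table>')
    c = MgsCrawler(html2)
    print(c.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'))  # StudioA
    print(c.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'))      # ABC-123
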
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="EnlargeImage"]/@href')).strip(" ['']")
# result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getExtrafanart(htmlcode): # 获取剧照 def getExtrafanart(htmlcode2): # 获取剧照
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>') html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode) html = html_pather.search(htmlcode2)
if html: if html:
html = html.group() html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"') extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
@@ -104,36 +37,35 @@ def getExtrafanart(htmlcode): # 获取剧照
def main(number2): def main(number2):
number=number2.upper() number=number2.upper()
htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'})) htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode, 'lxml') soup = BeautifulSoup(htmlcode2, 'lxml')
a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','') b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b) #print(b)
try: dic = {
dic = { 'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''), 'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'studio': getStudio(a), 'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'outline': getOutline(b), 'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'runtime': getRuntime(a), 'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'director': getDirector(a), 'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'actor': getActor(a), 'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'release': getRelease(a), 'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'number': getNum(a), 'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'cover': getCover(htmlcode), 'imagecut': 1,
'imagecut': 1, 'tag': getTag(a2),
'tag': getTag(a), 'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'label': getLabel(a), 'extrafanart': getExtrafanart(htmlcode2),
'extrafanart': getExtrafanart(htmlcode), 'year': str(re.findall('\d{4}',a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '', 'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/', 'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py', 'source': 'mgstage.py',
'series': getSeries(a), 'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
} }
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js return js

WebCrawler/storyline.py

@@ -5,7 +5,6 @@ import json
import builtins import builtins
from ADC_function import * from ADC_function import *
from lxml.html import fromstring from lxml.html import fromstring
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher from difflib import SequenceMatcher
from unicodedata import category from unicodedata import category
@@ -13,7 +12,7 @@ from number_parser import is_uncensored
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"} G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池') G_mode_txt = ('顺序执行','线程池')
class noThread(object): class noThread(object):
def map(self, fn, param): def map(self, fn, param):
@@ -25,14 +24,15 @@ class noThread(object):
# 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后
def getStoryline(number, title, sites: list=None): def getStoryline(number, title, sites: list=None, 无码=None):
start_time = time.time() start_time = time.time()
conf = config.getInstance() conf = config.getInstance()
if not conf.is_storyline(): if not conf.is_storyline():
return '' return ''
debug = conf.debug() or conf.storyline_show() == 2 debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number): unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',') storyine_sites += conf.storyline_uncensored_site().split(',')
else: else:
storyine_sites += conf.storyline_censored_site().split(',') storyine_sites += conf.storyline_censored_site().split(',')
@@ -49,9 +49,8 @@ def getStoryline(number, title, sites: list=None):
cores = min(len(apply_sites), os.cpu_count()) cores = min(len(apply_sites), os.cpu_count())
if cores == 0: if cores == 0:
return '' return ''
run_mode = conf.storyline_mode() run_mode = 1 if conf.storyline_mode() > 0 else 0
assert run_mode in (0,1,2) with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args) results = pool.map(getStoryline_mp, mp_args)
sel = '' sel = ''
if not debug and conf.storyline_show() == 0: if not debug and conf.storyline_show() == 0:
@@ -62,7 +61,7 @@ def getStoryline(number, title, sites: list=None):
if not len(sel): if not len(sel):
sel = value sel = value
return sel return sel
# 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 # 以下debug结果输出会写入日志
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = '' sel_site = ''
for site, desc in zip(apply_sites, results): for site, desc in zip(apply_sites, results):
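
With the process pool gone, storyline_mode collapses to 0 (sequential, via the duck-typed noThread) or 1 (thread pool). A self-contained sketch of the same selection pattern (the noThread stand-in is restated here so the snippet runs on its own):

    from multiprocessing.dummy import Pool as ThreadPool

    class noThread(object):
        # duck-typed sequential "pool": same map() surface as ThreadPool
        def map(self, fn, param):
            return list(map(fn, param))
        def __enter__(self):
            return self
        def __exit__(self, *args):
            pass

    run_mode = 1                     # 0 = sequential, 1 = thread pool
    tasks = ['siteA', 'siteB']       # placeholder site names
    with ThreadPool(2) if run_mode > 0 else noThread() as pool:
        print(pool.map(len, tasks))  # [5, 5]
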
@@ -80,34 +79,33 @@ def getStoryline(number, title, sites: list=None):
def getStoryline_mp(args): def getStoryline_mp(args):
def _inner(site, number, title, debug): (site, number, title, debug) = args
start_time = time.time() start_time = time.time()
storyline = None storyline = None
if not isinstance(site, str): if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
# 进程池模式的子进程getStoryline_*()的print()不会写入日志中,线程池和顺序执行不受影响
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline return storyline
return _inner(*args) elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
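
Pool.map() hands each worker exactly one positional argument, hence the packed tuple that getStoryline_mp now unpacks itself instead of forwarding through the removed _inner closure; the pattern in isolation:

    from multiprocessing.dummy import Pool as ThreadPool

    def worker(args):
        site, number = args          # unpack the single mapped argument
        return f'{site}:{number}'

    mp_args = [('airav', 'ABC-123'), ('xcity', 'ABC-123')]
    with ThreadPool(2) as pool:
        print(pool.map(worker, mp_args))  # ['airav:ABC-123', 'xcity:ABC-123']
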
def getStoryline_airav(number, debug): def getStoryline_airav(number, debug):
@@ -308,8 +306,8 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0])) res = session.get(urljoin(res.url, lks[0]))
cookie = None cookie = None
lx = fromstring(res.text) lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles): if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found") raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug) idx = amazon_select_one(titles, q_title, number, debug)
@@ -325,8 +323,9 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0])) res = session.get(urljoin(res.url, lks[0]))
cookie = None cookie = None
lx = fromstring(res.text) lx = fromstring(res.text)
div = lx.xpath('//*[@id="productDescription"]')[0] p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)]) p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1) + ' '.join(p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip() ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None: if cookie is None:
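
One nit in the new join: ' '.join(p1) + ' '.join(p2) glues the last fragment of p1 directly onto the first fragment of p2 with no separator; joining the concatenated lists would keep it (an alternative sketch, not what the code above does):

    p1 = ['First paragraph.']
    p2 = ['Second paragraph.']
    print(' '.join(p1) + ' '.join(p2))  # 'First paragraph.Second paragraph.' -- fused
    print(' '.join(p1 + p2))            # 'First paragraph. Second paragraph.'
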
@@ -406,10 +405,10 @@ def amazon_select_one(a_titles, q_title, number, debug):
# debug 模式下记录识别准确率日志 # debug 模式下记录识别准确率日志
if ratio < 0.9: if ratio < 0.9:
# 相似度[0.5, 0.9)的淘汰结果单独记录日志 # 相似度[0.5, 0.9)的淘汰结果单独记录日志
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1 return -1
# 被采信的结果日志 # 被采信的结果日志
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel return sel

WebCrawler/xcity.py

@@ -128,7 +128,7 @@ def getOutline(html, number, title):
a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字 a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字
if len(a): if len(a):
site = [n for n in storyline_site if n in a] site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site) g = getStoryline(number, title, site, 无码=False)
if len(g): if len(g):
return g return g
try: try:

View File

@@ -1,119 +1,130 @@
# 详细教程请看 # 详细教程请看
# - https://github.com/yoshiko2/Movie_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini # - https://github.com/yoshiko2/Movie_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini
[common] [common]
main_mode=1 main_mode=1
source_folder=./ source_folder=./
failed_output_folder=failed failed_output_folder=failed
success_output_folder=JAV_output success_output_folder=JAV_output
soft_link=0 link_mode=0
failed_move=1 ; 0: 不刮削硬链接文件 1: 刮削硬链接文件
auto_exit=0 scan_hardlink=0
translate_to_sc=0 failed_move=0
multi_threading=0 auto_exit=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) translate_to_sc=0
actor_gender=female multi_threading=0
del_empty_folder=1 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
; 跳过最近(默认:30)天新修改过的.NFO可避免整理模式(main_mode=3)和软连接(soft_link=0)时 actor_gender=female
; 反复刮削靠前的视频文件0为处理所有视频文件 del_empty_folder=1
nfo_skip_days=30 ; 跳过最近(默认:30)天新修改过的.NFO可避免整理模式(main_mode=3)和软连接(soft_link=0)时
; 处理完多少个视频文件后停止0为处理所有视频文件 ; 反复刮削靠前的视频文件0为处理所有视频文件
stop_counter=0 nfo_skip_days=30
; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 ; 处理完多少个视频文件后停止0为处理所有视频文件
ignore_failed_list=0 stop_counter=0
download_only_missing_images=1 ; 再运行延迟时间单位h时m分s秒 举例: 1h30m45s(1小时30分45秒) 45(45秒)
mapping_table_validity=7 ; stop_counter不为零的条件下才有效每处理stop_counter部影片后延迟rerun_delay秒再次运行
rerun_delay=0
[proxy] ; 以上三个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁
;proxytype: http or socks5 or socks5h switch: 0 1 ignore_failed_list=0
switch=0 download_only_missing_images=1
type=socks5 mapping_table_validity=7
proxy=127.0.0.1:1080
timeout=10 [proxy]
retry=3 ;proxytype: http or socks5 or socks5h switch: 0 1
cacert_file= switch=0
type=socks5
[Name_Rule] proxy=127.0.0.1:1080
location_rule=actor+'/'+number timeout=10
naming_rule=number+'-'+title retry=3
max_title_len=50 cacert_file=
[update] [Name_Rule]
update_check=1 location_rule=actor+'/'+number
naming_rule=number+'-'+title
[priority] max_title_len=50
website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club
[update]
[escape] update_check=1
literals=\()/
folders=failed,JAV_output [priority]
website=javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91,javdb,gcolle
[debug_mode]
switch=0 [escape]
literals=\()/
; 机器翻译 folders=failed,JAV_output
[translate]
switch=0 [debug_mode]
;可选项 google-free,azure switch=0
engine=google-free
; azure翻译密钥 ; 机器翻译
key= [translate]
; 翻译延迟 switch=0
delay=1 ;可选项 google-free,azure
values=title,outline engine=google-free
service_site=translate.google.cn ; azure翻译密钥
key=
; 预告片 ; 翻译延迟
[trailer] delay=1
switch=0 values=title,outline
service_site=translate.google.cn
; 用来确定是否是无码
[uncensored] ; 预告片
uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,MMDV [trailer]
switch=0
[media]
; 影片后缀 ; 用来确定是否是无码
media_type=.mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.iso,.mpg,.m4v [uncensored]
; 字幕后缀 uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,MMDV
sub_type=.smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml
[media]
; 水印 ; 影片后缀
[watermark] media_type=.mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.iso,.mpg,.m4v
switch=1 ; 字幕后缀
water=2 sub_type=.smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml
; 左上 0, 右上 1, 右下 2 左下 3
; 水印
; 剧照 [watermark]
[extrafanart] switch=1
switch=1 water=2
parallel_download=5 ; 左上 0, 右上 1, 右下 2 左下 3
extrafanart_folder=extrafanart
; 剧照
; 剧情简介 [extrafanart]
[storyline] switch=1
switch=1 parallel_download=5
; website为javbus javdb avsox xcity carib时site censored_site uncensored_site 为获取剧情简介信息的 extrafanart_folder=extrafanart
; 可选数据源站点列表。列表内站点同时并发查询,取值优先级由冒号前的序号决定,从小到大,数字小的站点没数据才会采用后面站点获得的。
; 其中airavwiki airav avno1 58avgo是中文剧情简介区别是airav只能查有码avno1 airavwiki 有码无码都能查, ; 剧情简介
; 58avgo只能查无码或者流出破解马赛克的影片(此功能没使用)。 [storyline]
; xcity和amazon是日语的由于amazon商城没有番号信息选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询, switch=1
; 设置成不查询可大幅提高刮削速度。 ; website为javbus javdb avsox xcity carib时site censored_site uncensored_site 为获取剧情简介信息的
; site= ; 可选数据源站点列表。列表内站点同时并发查询,取值优先级由冒号前的序号决定,从小到大,数字小的站点没数据才会采用后面站点获得的。
site=1:avno1,4:airavwiki ; 其中airavwiki airav avno1 58avgo是中文剧情简介区别是airav只能查有码avno1 airavwiki 有码无码都能查,
censored_site=2:airav,5:xcity,6:amazon ; 58avgo只能查无码或者流出破解马赛克的影片(此功能没使用)。
uncensored_site=3:58avgo ; xcity和amazon是日语的由于amazon商城没有番号信息选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询,
; 运行模式0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) ; 设置成不查询可大幅提高刮削速度。
run_mode=1 ; site=
; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志)剧情简介失效时可打开2查看原因 site=1:avno1,4:airavwiki
show_result=0 censored_site=2:airav,5:xcity,6:amazon
uncensored_site=3:58avgo
; 繁简转换 繁简转换模式mode=0:不转换 1:繁转简 2:简转繁 ; 运行模式0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快)
[cc_convert] run_mode=1
mode=1 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志)剧情简介失效时可打开2查看原因
vars=outline,series,studio,tag,title show_result=0
[javdb] ; 繁简转换 繁简转换模式mode=0:不转换 1:繁转简 2:简转繁
sites=33,34 [cc_convert]
mode=1
; 人脸识别 hog:方向梯度直方图(不太准确,速度快) cnn:深度学习模型(准确需要GPU/CUDA,速度慢) vars=outline,series,studio,tag,title
[face]
locations_model=hog [javdb]
sites=38,39
; 人脸识别 locations_model=hog:方向梯度直方图(不太准确,速度快) cnn:深度学习模型(准确需要GPU/CUDA,速度慢)
; uncensored_only=0:对全部封面进行人脸识别 1:只识别无码封面,有码封面直接切右半部分
; aways_imagecut=0:按各网站默认行为 1:总是裁剪封面,开启此项将无视[common]download_only_missing_images=1总是覆盖封面
; 封面裁剪的宽高比可配置公式为aspect_ratio/3。默认aspect_ratio=2.12: 适配大部分有码影片封面前一版本默认为2/3即aspect_ratio=2
[face]
locations_model=hog
uncensored_only=1
aways_imagecut=0
aspect_ratio=2.12
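
The aspect_ratio/3 formula means the poster is cropped to width = height × aspect_ratio / 3. A quick check of the two documented defaults (assuming the ratio is width over height, as the comment describes):

    def poster_width(height: int, aspect_ratio: float = 2.12) -> int:
        # crop width = height * aspect_ratio / 3
        return int(height * aspect_ratio / 3)

    assert poster_width(300, 2.12) == 212  # current default
    assert poster_width(300, 2.0) == 200   # previous default, i.e. the old 2/3 ratio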

config.py
View File

@@ -3,19 +3,14 @@ import re
 import sys
 import configparser
 import time
+import typing
 from pathlib import Path

 G_conf_override = {
     # index 0 save Config() first instance for quick access by using getInstance()
     0: None,
     # register override config items
-    "common:main_mode": None,
-    "common:source_folder": None,
-    "common:auto_exit": None,
-    "common:nfo_skip_days": None,
-    "common:stop_counter": None,
-    "common:ignore_failed_list": None,
-    "debug_mode:switch": None
+    # no need anymore
 }
@@ -74,17 +69,17 @@ class Config:
         elif (Path(__file__).resolve().parent / 'config.ini').is_file():
             res_path = Path(__file__).resolve().parent / 'config.ini'
         if res_path is None:
-            sys.exit(2)
+            os._exit(2)
         ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:")
         if re.search('n', ins, re.I):
-            sys.exit(2)
+            os._exit(2)
         # Only the user's home directory is guaranteed writable, so ~/mdc.ini is chosen as the path for the generated
         # config file rather than the current directory, which may not be writable. Current versions no longer encourage keeping the config file in the current directory; it remains only as a trick for switching between multiple config files.
         write_path = path_search_order[2]  # Path.home() / "mdc.ini"
         write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8')
         print("Config file '{}' created.".format(write_path.resolve()))
         input("Press Enter key exit...")
-        sys.exit(0)
+        os._exit(0)
@@ -95,29 +90,86 @@ class Config:
         # except Exception as e:
         #     print("[-]Config file not found! Use the default settings")
         #     print("[-]",e)
-        #     sys.exit(3)
+        #     os._exit(3)
         # #self.conf = self._default_config()

-    def getboolean_override(self, section, item) -> bool:
-        return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(
-            G_conf_override[f"{section}:{item}"])
-
-    def getint_override(self, section, item) -> int:
-        return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(
-            G_conf_override[f"{section}:{item}"])
-
-    def get_override(self, section, item) -> str:
-        return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(
-            G_conf_override[f"{section}:{item}"])
+    def set_override(self, option_cmd: str):
+        """
+        Generic config override option: -C override-string
+        Override string syntax: section:key=value[;[section:]key=value][;[section:]key+=value],
+        multiple keys separated by semicolons; names may omit trailing characters.
+        Alternatively section:key+=value[...] appends to the existing value; = and += may be mixed between keys.
+        Example: face:aspect_ratio=2;aways_imagecut=1;priority:website=javdb
+        A section name must appear at least once at the start; after a semicolon a bare key=value is
+        allowed as long as all following keys belong to the same section.
+        E.g. with sections [proxy] and [priority] in the config file, 'pro' can stand for proxy and 'pri' for priority.
+        [face] has 4 keys below it: locations_model= uncensored_only= aways_imagecut= aspect_ratio=
+        l, lo, loc, loca, locat, locati... up to the full locations_model all denote the locations_model= key
+        u, un, unc... up to the full uncensored_only denote the uncensored_only= key
+        aw, awa... up to the full aways_imagecut denote the aways_imagecut= key
+        as, asp... up to the full aspect_ratio denote the aspect_ratio= key
+        'a' alone is ambiguous and therefore not a valid abbreviated key
+        """
+        def err_exit(str):
+            print(str)
+            os._exit(2)
+
+        sections = self.conf.sections()
+        sec_name = None
+        for cmd in option_cmd.split(';'):
+            syntax_err = True
+            rex = re.findall(r'^(.*?):(.*?)(=|\+=)(.*)$', cmd, re.U)
+            if len(rex) and len(rex[0]) == 4:
+                (sec, key, assign, val) = rex[0]
+                sec_lo = sec.lower().strip()
+                key_lo = key.lower().strip()
+                syntax_err = False
+            elif sec_name:  # a section name has already appeared; later keys of the same section may omit it
+                rex = re.findall(r'^(.*?)(=|\+=)(.*)$', cmd, re.U)
+                if len(rex) and len(rex[0]) == 3:
+                    (key, assign, val) = rex[0]
+                    sec_lo = sec_name.lower()
+                    key_lo = key.lower().strip()
+                    syntax_err = False
+            if syntax_err:
+                err_exit(f"[-]Config override syntax incorrect. example: 'd:s=1' or 'debug_mode:switch=1'. cmd='{cmd}' all='{option_cmd}'")
+            if not len(sec_lo):
+                err_exit(f"[-]Config override Section name '{sec}' is empty! cmd='{cmd}'")
+            if not len(key_lo):
+                err_exit(f"[-]Config override Key name '{key}' is empty! cmd='{cmd}'")
+            if not len(val.strip()):
+                print(f"[!]Config override value '{val}' is empty! cmd='{cmd}'")
+            sec_name = None
+            for s in sections:
+                if not s.lower().startswith(sec_lo):
+                    continue
+                if sec_name:
+                    err_exit(f"[-]Config override Section short name '{sec_lo}' is not unique! dup1='{sec_name}' dup2='{s}' cmd='{cmd}'")
+                sec_name = s
+            if sec_name is None:
+                err_exit(f"[-]Config override Section name '{sec}' not found! cmd='{cmd}'")
+            key_name = None
+            keys = self.conf[sec_name]
+            for k in keys:
+                if not k.lower().startswith(key_lo):
+                    continue
+                if key_name:
+                    err_exit(f"[-]Config override Key short name '{key_lo}' is not unique! dup1='{key_name}' dup2='{k}' cmd='{cmd}'")
+                key_name = k
+            if key_name is None:
+                err_exit(f"[-]Config override Key name '{key}' not found! cmd='{cmd}'")
+            if assign == "+=":
+                val = keys[key_name] + val
+            if self.debug():
+                print(f"[!]Set config override [{sec_name}]{key_name}={val} by cmd='{cmd}'")
+            self.conf.set(sec_name, key_name, val)
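
Any unambiguous prefix of a section or key name resolves under the rule above. A usage sketch, assuming a loaded Config instance:

    import config

    conf = config.getInstance()
    # 'd:s=1' expands to [debug_mode] switch=1; 'pri:w=javdb' to [priority] website=javdb
    conf.set_override("d:s=1;pri:w=javdb;face:asp=2.12")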
     def main_mode(self) -> int:
         try:
-            return self.getint_override("common", "main_mode")
+            return self.conf.getint("common", "main_mode")
         except ValueError:
             self._exit("common:main_mode")

     def source_folder(self) -> str:
-        return self.get_override("common", "source_folder")
+        return self.conf.get("common", "source_folder")

     def failed_folder(self) -> str:
         return self.conf.get("common", "failed_output_folder")
@@ -128,14 +180,17 @@ class Config:
     def actor_gender(self) -> str:
         return self.conf.get("common", "actor_gender")

-    def soft_link(self) -> bool:
-        return self.conf.getboolean("common", "soft_link")
+    def link_mode(self) -> int:
+        return self.conf.getint("common", "link_mode")
+
+    def scan_hardlink(self) -> bool:
+        return self.conf.getboolean("common", "scan_hardlink", fallback=False)  # option missing: do not scrape hard links by default

     def failed_move(self) -> bool:
         return self.conf.getboolean("common", "failed_move")

     def auto_exit(self) -> bool:
-        return self.getboolean_override("common", "auto_exit")
+        return self.conf.getboolean("common", "auto_exit")

     def translate_to_sc(self) -> bool:
         return self.conf.getboolean("common", "translate_to_sc")
@@ -147,19 +202,13 @@ class Config:
         return self.conf.getboolean("common", "del_empty_folder")

     def nfo_skip_days(self) -> int:
-        try:
-            return self.getint_override("common", "nfo_skip_days")
-        except:
-            return 30
+        return self.conf.getint("common", "nfo_skip_days", fallback=30)

     def stop_counter(self) -> int:
-        try:
-            return self.getint_override("common", "stop_counter")
-        except:
-            return 0
+        return self.conf.getint("common", "stop_counter", fallback=0)

     def ignore_failed_list(self) -> bool:
-        return self.getboolean_override("common", "ignore_failed_list")
+        return self.conf.getboolean("common", "ignore_failed_list")

     def download_only_missing_images(self) -> bool:
         return self.conf.getboolean("common", "download_only_missing_images")
@@ -167,6 +216,18 @@ class Config:
     def mapping_table_validity(self) -> int:
         return self.conf.getint("common", "mapping_table_validity")

+    def rerun_delay(self) -> int:
+        value = self.conf.get("common", "rerun_delay")
+        if not (isinstance(value, str) and re.match(r'^[\dsmh]+$', value, re.I)):
+            return 0  # does not match '1h30m45s' or '30' or '1s2m1h4s5m'
+        if value.isnumeric() and int(value) >= 0:
+            return int(value)
+        sec = 0
+        sec += sum(int(v) for v in re.findall(r'(\d+)s', value, re.I))
+        sec += sum(int(v) for v in re.findall(r'(\d+)m', value, re.I)) * 60
+        sec += sum(int(v) for v in re.findall(r'(\d+)h', value, re.I)) * 3600
+        return sec
+
     def is_translate(self) -> bool:
         return self.conf.getboolean("translate", "switch")
@@ -243,8 +304,8 @@ class Config:
     def media_type(self) -> str:
         return self.conf.get('media', 'media_type')

-    def sub_rule(self):
-        return self.conf.get('media', 'sub_type').split(',')
+    def sub_rule(self) -> typing.Set[str]:
+        return set(self.conf.get('media', 'sub_type').lower().split(','))

     def naming_rule(self) -> str:
         return self.conf.get("Name_Rule", "naming_rule")
@@ -277,7 +338,7 @@ class Config:
         return self.conf.get("escape", "folders")

     def debug(self) -> bool:
-        return self.getboolean_override("debug_mode", "switch")
+        return self.conf.getboolean("debug_mode", "switch")

     def is_storyline(self) -> bool:
         try:
@@ -304,43 +365,34 @@ class Config:
             return "3:58avgo"

     def storyline_show(self) -> int:
-        try:
-            v = self.conf.getint("storyline", "show_result")
-            return v if v in (0, 1, 2) else 2 if v > 2 else 0
-        except:
-            return 0
+        v = self.conf.getint("storyline", "show_result", fallback=0)
+        return v if v in (0, 1, 2) else 2 if v > 2 else 0

     def storyline_mode(self) -> int:
-        try:
-            v = self.conf.getint("storyline", "run_mode")
-            return v if v in (0, 1, 2) else 2 if v > 2 else 0
-        except:
-            return 1
+        return 1 if self.conf.getint("storyline", "run_mode", fallback=1) > 0 else 0

     def cc_convert_mode(self) -> int:
-        try:
-            v = self.conf.getint("cc_convert", "mode")
-            return v if v in (0, 1, 2) else 2 if v > 2 else 0
-        except:
-            return 1
+        v = self.conf.getint("cc_convert", "mode", fallback=1)
+        return v if v in (0, 1, 2) else 2 if v > 2 else 0

     def cc_convert_vars(self) -> str:
-        try:
-            return self.conf.get("cc_convert", "vars")
-        except:
-            return "actor,director,label,outline,series,studio,tag,title"
+        return self.conf.get("cc_convert", "vars",
+                             fallback="actor,director,label,outline,series,studio,tag,title")

     def javdb_sites(self) -> str:
-        try:
-            return self.conf.get("javdb", "sites")
-        except:
-            return "33,34"
+        return self.conf.get("javdb", "sites", fallback="38,39")

     def face_locations_model(self) -> str:
-        try:
-            return self.conf.get("face", "locations_model")
-        except:
-            return "hog"
+        return self.conf.get("face", "locations_model", fallback="hog")
+
+    def face_uncensored_only(self) -> bool:
+        return self.conf.getboolean("face", "uncensored_only", fallback=True)
+
+    def face_aways_imagecut(self) -> bool:
+        return self.conf.getboolean("face", "aways_imagecut", fallback=False)
+
+    def face_aspect_ratio(self) -> float:
+        return self.conf.getfloat("face", "aspect_ratio", fallback=2.12)

     @staticmethod
     def _exit(sec: str) -> None:
@@ -358,7 +410,8 @@ class Config:
         conf.set(sec1, "source_folder", "./")
         conf.set(sec1, "failed_output_folder", "failed")
         conf.set(sec1, "success_output_folder", "JAV_output")
-        conf.set(sec1, "soft_link", "0")
+        conf.set(sec1, "link_mode", "0")
+        conf.set(sec1, "scan_hardlink", "0")
         conf.set(sec1, "failed_move", "1")
         conf.set(sec1, "auto_exit", "0")
         conf.set(sec1, "translate_to_sc", "1")

@@ -370,6 +423,7 @@ class Config:
         conf.set(sec1, "ignore_failed_list", 0)
         conf.set(sec1, "download_only_missing_images", 1)
         conf.set(sec1, "mapping_table_validity", 7)
+        conf.set(sec1, "rerun_delay", 0)

         sec2 = "proxy"
         conf.add_section(sec2)
@@ -423,9 +477,9 @@ class Config:
         sec11 = "media"
         conf.add_section(sec11)
         conf.set(sec11, "media_type",
-                 ".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.MP4,.AVI,.RMVB,.WMV,.MOV,.MKV,.FLV,.TS,.WEBM,iso,ISO")
+                 ".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,iso")
         conf.set(sec11, "sub_type",
-                 ".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")
+                 ".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")

         sec12 = "watermark"
         conf.add_section(sec12)
@@ -503,8 +557,7 @@ if __name__ == "__main__":
     config = Config()
-    mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override',
-               'ini_path'}
+    mfilter = {'conf', 'proxy', '_exit', '_default_config', 'ini_path', 'set_override'}
     for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
         evprint(f'config.{_m}()')
     pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}
@@ -513,36 +566,13 @@ if __name__ == "__main__":
     for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]:
         evprint(f'getInstance().proxy().{_p}')

-    # Override Test
-    G_conf_override["common:nfo_skip_days"] = 4321
-    G_conf_override["common:stop_counter"] = 1234
-    assert config.nfo_skip_days() == 4321
-    assert getInstance().stop_counter() == 1234
-    # remove override
-    G_conf_override["common:stop_counter"] = None
-    G_conf_override["common:nfo_skip_days"] = None
-    assert config.nfo_skip_days() != 4321
-    assert config.stop_counter() != 1234
     # Create new instance
     conf2 = Config()
     assert getInstance() != conf2
     assert getInstance() == config
-    G_conf_override["common:main_mode"] = 9
-    G_conf_override["common:source_folder"] = "A:/b/c"
-    # Override effect to all instances
-    assert config.main_mode() == 9
-    assert conf2.main_mode() == 9
-    assert getInstance().main_mode() == 9
-    assert conf2.source_folder() == "A:/b/c"
-    print("### Override Test ###".center(36))
-    evprint('getInstance().main_mode()')
-    evprint('config.source_folder()')
-    G_conf_override["common:main_mode"] = None
-    evprint('conf2.main_mode()')
-    evprint('config.main_mode()')
-    # unregister key acess will raise except
-    try:
-        print(G_conf_override["common:actor_gender"])
-    except KeyError as ke:
-        print(f'Catched KeyError: {ke} is not a register key of G_conf_override dict.', file=sys.stderr)
+    conf2.set_override("d:s=1;face:asp=2;f:aw=0;pri:w=javdb;f:l=")
+    assert conf2.face_aspect_ratio() == 2
+    assert conf2.face_aways_imagecut() == False
+    assert conf2.sources() == "javdb"
     print(f"Load Config file '{conf2.ini_path}'.")

core.py
View File

@@ -1,5 +1,6 @@
 import json
 import os.path
+import os
 import pathlib
 import re
 import shutil

@@ -10,6 +11,7 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
+from lxml import etree

 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -27,15 +29,15 @@ def escape_path(path, escape_literals: str):  # Remove escape literals
 def moveFailedFolder(filepath):
     conf = config.getInstance()
     failed_folder = conf.failed_folder()
-    soft_link = conf.soft_link()
+    link_mode = conf.link_mode()
     # In mode 3 or link mode, maintain a failure list instead; it is loaded when the scan starts so those paths are excluded and not processed over and over.
     # The old behavior of soft-linking into the failed folder was not intuitive and made failed files hard to locate; recording the file path directly is better.
-    if conf.main_mode() == 3 or soft_link:
+    if conf.main_mode() == 3 or link_mode:
         ftxt = os.path.abspath(os.path.join(failed_folder, 'failed_list.txt'))
         print("[-]Add to Failed List file, see '%s'" % ftxt)
         with open(ftxt, 'a', encoding='utf-8') as flt:
             flt.write(f'{filepath}\n')
-    elif conf.failed_move() and not soft_link:
+    elif conf.failed_move() and not link_mode:
         failed_name = os.path.join(failed_folder, os.path.basename(filepath))
         mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt'))
         print("'[-]Move to Failed output folder, see '%s'" % mtxt)
@@ -69,10 +71,12 @@ def get_info(json_data):  # return the data in the json
     return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label

-def small_cover_check(path, number, cover_small, leak_word, c_word, hack_word, filepath):
-    filename = f"{number}{leak_word}{c_word}{hack_word}-poster.jpg"
-    download_file_with_filename(cover_small, filename, path, filepath)
-    print('[+]Image Downloaded! ' + os.path.join(path, filename))
+def small_cover_check(path, filename, cover_small, movie_path):
+    full_filepath = Path(path) / filename
+    if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(str(full_filepath)):
+        return
+    download_file_with_filename(cover_small, filename, path, movie_path)
+    print('[+]Image Downloaded! ' + full_filepath.name)

 def create_folder(json_data):  # create the folder

@@ -101,7 +105,7 @@ def create_folder(json_data):  # create the folder
             os.makedirs(path)
         except:
             print(f"[-]Fatal error! Can not make folder '{path}'")
-            sys.exit(0)
+            os._exit(0)

     return os.path.normpath(path)
@@ -121,7 +125,7 @@ def download_file_with_filename(url, filename, path, filepath):
                 os.makedirs(path)
             except:
                 print(f"[-]Fatal error! Can not make folder '{path}'")
-                sys.exit(0)
+                os._exit(0)
             proxies = configProxy.proxies()
             headers = {
                 'User-Agent': G_USER_AGENT}

@@ -138,7 +142,7 @@ def download_file_with_filename(url, filename, path, filepath):
                 os.makedirs(path)
             except:
                 print(f"[-]Fatal error! Can not make folder '{path}'")
-                sys.exit(0)
+                os._exit(0)
             headers = {
                 'User-Agent': G_USER_AGENT}
             r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -213,7 +217,7 @@ def extrafanart_download_one_by_one(data, path, filepath):
                 break
         if file_not_exist_or_empty(jpg_fullpath):
             return
-        print('[+]Image Downloaded!', jpg_fullpath)
+        print('[+]Image Downloaded!', Path(jpg_fullpath).name)
         j += 1
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')

@@ -244,7 +248,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if failed:  # non-fatal: the movie is not moved to the failed folder; mode 3 can fill the gap later
         print(f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.")
     else:
-        print(f"[+]Successfully downloaded {len(result)} extrafanart to '{extrafanart_dir}'")
+        print(f"[+]Successfully downloaded {len(result)} extrafanarts.")
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
@@ -255,7 +259,7 @@ def image_ext(url):
         return ".jpg"

 # did the cover download succeed? otherwise move to failed
-def image_download(cover, fanart_path,thumb_path, path, filepath):
+def image_download(cover, fanart_path, thumb_path, path, filepath):
     full_filepath = os.path.join(path, fanart_path)
     if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
         return

@@ -273,7 +277,7 @@ def image_download(cover, fanart_path, thumb_path, path, filepath):
             break
     if file_not_exist_or_empty(full_filepath):
         return
-    print('[+]Image Downloaded!', full_filepath)
+    print('[+]Image Downloaded!', Path(full_filepath).name)

     shutil.copyfile(full_filepath, os.path.join(path, thumb_path))
@@ -289,8 +293,14 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
             os.makedirs(path)
         except:
             print(f"[-]Fatal error! can not make folder '{path}'")
-            sys.exit(0)
+            os._exit(0)

+    old_nfo = None
+    try:
+        if os.path.isfile(nfo_path):
+            old_nfo = etree.parse(nfo_path)
+    except:
+        pass
     # KODI's movie-info view cannot find the number; naming_rule=number+'#'+title would fix that but makes the title too long.
     # Putting the number into the usually-empty outline fits better, and the software leaves a larger display area for outline anyway.
     outline = f"{number}#{outline}"
@@ -354,6 +364,41 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
         print("    <premiered>" + release + "</premiered>", file=code)
         print("    <releasedate>" + release + "</releasedate>", file=code)
         print("    <release>" + release + "</release>", file=code)
+        if old_nfo:
+            try:
+                xur = old_nfo.xpath('//userrating/text()')[0]
+                if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()):
+                    print(f"    <userrating>{xur.strip()}</userrating>", file=code)
+            except:
+                pass
+        try:
+            f_rating = json_data['用户评分']
+            uc = json_data['评分人数']
+            print(f"""    <rating>{round(f_rating * 2.0, 1)}</rating>
+    <criticrating>{round(f_rating * 20.0, 1)}</criticrating>
+    <ratings>
+      <rating name="javdb" max="5" default="true">
+        <value>{f_rating}</value>
+        <votes>{uc}</votes>
+      </rating>
+    </ratings>""", file=code)
+        except:
+            if old_nfo:
+                try:
+                    for rtag in ('rating', 'criticrating'):
+                        xur = old_nfo.xpath(f'//{rtag}/text()')[0]
+                        if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()):
+                            print(f"    <{rtag}>{xur.strip()}</{rtag}>", file=code)
+                    f_rating = old_nfo.xpath(f"//ratings/rating[@name='javdb']/value/text()")[0]
+                    uc = old_nfo.xpath(f"//ratings/rating[@name='javdb']/votes/text()")[0]
+                    print(f"""    <ratings>
+      <rating name="javdb" max="5" default="true">
+        <value>{f_rating}</value>
+        <votes>{uc}</votes>
+      </rating>
+    </ratings>""", file=code)
+                except:
+                    pass
         print("    <cover>" + cover + "</cover>", file=code)
         if config.getInstance().is_trailer():
             print("    <trailer>" + trailer + "</trailer>", file=code)
@@ -462,51 +507,51 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
 # ======================== end =================================

-def paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word):  # file path, number, suffix, destination
+def paste_file_to_folder(filepath, path, multi_part, number, part, leak_word, c_word, hack_word):  # file path, number, suffix, destination
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
-    file_parent_origin_path = str(filepath_obj.parent)
     try:
         targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}{houzhui}")
         # Never overwrite under any circumstances, to avoid the worst case where a data source or engine error
         # gives every file the same number and same-name overwrites destroy them all, one by one, beyond recovery
         if os.path.exists(targetpath):
             raise FileExistsError('File Exists on destination path, we will never overwriting.')
-        soft_link = config.getInstance().soft_link()
-        # if soft_link=1, use a soft link
-        if soft_link == 0:
+        link_mode = config.getInstance().link_mode()
+        # link_mode 1: create a soft link  2: prefer a hard link, fall back to a soft link on failure
+        # the old soft_link=2 code was removed, since default logging already traces where files came from
+        create_softlink = False
+        if link_mode not in (1, 2):
             shutil.move(filepath, targetpath)
-        elif soft_link == 1:
+        elif link_mode == 2:
+            # hard links fail across volumes/drive letters with an exception; fall back to a soft link
+            try:
+                os.link(filepath, targetpath, follow_symlinks=False)
+            except:
+                create_softlink = True
+        if link_mode == 1 or create_softlink:
             # First try a relative path so the video still opens when accessed over the network; if that fails
             # (e.g. across drive letters where relative paths cannot work), retry the soft link with an absolute path
             try:
                 filerelpath = os.path.relpath(filepath, path)
                 os.symlink(filerelpath, targetpath)
             except:
-                os.symlink(filepath_obj.resolve(), targetpath)
-        elif soft_link == 2:
-            shutil.move(filepath, targetpath)
-            # After moving the file, leave a traceable soft link at the original location pointing to the new one,
-            # so it is possible to see where the file went and files do not go missing after a wrong-number rename
-            # and move; this made manual recovery easy. Since soft links are no longer scraped, the filename suffix no longer needs changing.
-            targetabspath = os.path.abspath(targetpath)
-            if targetabspath != os.path.abspath(filepath):
-                targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
-                os.symlink(targetrelpath, filepath)
+                os.symlink(str(filepath_obj.resolve()), targetpath)

-        sub_res = config.getInstance().sub_rule()
-        for subname in sub_res:
-            sub_filepath = str(filepath_obj.with_suffix(subname))
-            if os.path.isfile(sub_filepath.replace(subname, ".chs" + subname)):
-                sub_filepath = sub_filepath.replace(subname, ".chs" + subname)
-                subname = ".chs" + subname
-            elif os.path.isfile(sub_filepath.replace(subname, ".cht" + subname)):
-                sub_filepath = sub_filepath.replace(subname, ".cht" + subname)
-                subname = ".cht" + subname
-            if os.path.isfile(sub_filepath):
-                shutil.move(sub_filepath, os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}{subname}"))
-                print('[+]Sub moved!')
-        return True
+        sub_res = config.getInstance().sub_rule()
+        for subfile in filepath_obj.parent.glob('**/*'):
+            if subfile.is_file() and subfile.suffix.lower() in sub_res:
+                if multi_part and part.lower() not in subfile.name.lower():
+                    continue
+                if filepath_obj.stem.split('.')[0].lower() != subfile.stem.split('.')[0].lower():
+                    continue
+                sub_targetpath = Path(path) / f"{number}{leak_word}{c_word}{hack_word}{''.join(subfile.suffixes)}"
+                if link_mode not in (1, 2):
+                    shutil.move(str(subfile), str(sub_targetpath))
+                    print(f"[+]Sub Moved! {sub_targetpath.name}")
+                else:
+                    shutil.copyfile(str(subfile), str(sub_targetpath))
+                    print(f"[+]Sub Copied! {sub_targetpath.name}")
+        return
     except FileExistsError as fee:
         print(f'[-]FileExistsError: {fee}')
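
The mode-2 mover below repeats the same strategy, so the pattern is worth isolating: a hard link cannot span filesystems, and link_mode=2 treats that failure as the cue to degrade to a symlink. A standalone sketch of the fallback chain (not the commit's exact helper; the project inlines it as above):

    import os

    def link_or_symlink(src: str, dst: str) -> None:
        try:
            os.link(src, dst, follow_symlinks=False)  # raises OSError across volumes/drive letters
        except OSError:
            try:
                # a relative symlink keeps network-share access working
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except OSError:
                os.symlink(os.path.abspath(src), dst)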
@@ -525,24 +570,39 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
         number += part  # number now carries the CD1 suffix
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
-    file_parent_origin_path = str(filepath_obj.parent)
     targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}{houzhui}")
     if os.path.exists(targetpath):
         raise FileExistsError('File Exists on destination path, we will never overwriting.')
     try:
-        if config.getInstance().soft_link():
-            os.symlink(filepath, targetpath)
-        else:
+        link_mode = config.getInstance().link_mode()
+        create_softlink = False
+        if link_mode not in (1, 2):
             shutil.move(filepath, targetpath)
+        elif link_mode == 2:
+            try:
+                os.link(filepath, targetpath, follow_symlinks=False)
+            except:
+                create_softlink = True
+        if link_mode == 1 or create_softlink:
+            try:
+                filerelpath = os.path.relpath(filepath, path)
+                os.symlink(filerelpath, targetpath)
+            except:
+                os.symlink(str(filepath_obj.resolve()), targetpath)

         sub_res = config.getInstance().sub_rule()
-        for subname in sub_res:
-            sub_filepath = str(filepath_obj.with_suffix(subname))
-            if os.path.isfile(sub_filepath):  # move subtitles
-                shutil.move(sub_filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}{subname}"))
-                print('[+]Sub moved!')
-        print('[!]Success')
-        return True
+        for subfile in filepath_obj.parent.glob('**/*'):
+            if subfile.is_file() and subfile.suffix.lower() in sub_res:
+                if multi_part and part.lower() not in subfile.name.lower():
+                    continue
+                sub_targetpath = Path(path) / f"{number}{leak_word}{c_word}{hack_word}{''.join(subfile.suffixes)}"
+                if link_mode not in (1, 2):
+                    shutil.move(str(subfile), str(sub_targetpath))
+                    print(f"[+]Sub Moved! {sub_targetpath.name}")
+                else:
+                    shutil.copyfile(str(subfile), str(sub_targetpath))
+                    print(f"[+]Sub Copied! {sub_targetpath.name}")
+        return
     except FileExistsError as fee:
         print(f'[-]FileExistsError: {fee}')
         return
@@ -554,18 +614,6 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
         return

-def get_part(filepath):
-    try:
-        if re.search('-CD\d+', filepath):
-            return re.findall('-CD\d+', filepath)[0]
-        if re.search('-cd\d+', filepath):
-            return re.findall('-cd\d+', filepath)[0]
-    except:
-        print("[-]failed!Please rename the filename again!")
-        moveFailedFolder(filepath)
-    return
-
 def debug_print(data: json):
     try:
         print("[+] ------- DEBUG INFO -------")
@@ -578,14 +626,65 @@ def debug_print(data: json):
             if i == 'extrafanart':
                 print('[+] -', "%-14s" % i, ':', len(v), 'links')
                 continue
-            print('[+] -', "%-14s" % i, ':', v)
+            print(f'[+] - {i:<{cnspace(i,14)}} : {v}')
         print("[+] ------- DEBUG INFO -------")
     except:
         pass
-def core_main(file_path, number_th, oCC):
+def core_main_no_net_op(movie_path, number):
+    conf = config.getInstance()
+    part = ''
+    leak_word = ''
+    leak = 0
+    c_word = ''
+    cn_sub = ''
+    hack = ''
+    hack_word = ''
+    ext = '.jpg'
+    imagecut = 1
+    path = str(Path(movie_path).parent)
+
+    if re.search('[-_]CD\d+', movie_path, re.IGNORECASE):
+        part = re.findall('[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper()
+    if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path,
+                 re.I) or '中文' in movie_path or '字幕' in movie_path:
+        cn_sub = '1'
+        c_word = '-C'  # suffix for titles with Chinese subtitles
+    uncensored = 1 if is_uncensored(number) else 0
+    if '流出' in movie_path or 'uncensored' in movie_path.lower():
+        leak_word = '-流出'  # suffix for leaked titles
+        leak = 1
+    if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path:
+        hack = 1
+        hack_word = "-hack"
+
+    prestr = f"{number}{leak_word}{c_word}{hack_word}"
+    fanart_path = f"{prestr}-fanart{ext}"
+    poster_path = f"{prestr}-poster{ext}"
+    thumb_path = f"{prestr}-thumb{ext}"
+    full_fanart_path = os.path.join(path, fanart_path)
+    full_poster_path = os.path.join(path, poster_path)
+    full_thumb_path = os.path.join(path, thumb_path)
+    full_nfo = Path(path) / f"{prestr}{part}.nfo"
+    if full_nfo.is_file():
+        if full_nfo.read_text(encoding='utf-8').find(r'<tag>无码</tag>') >= 0:
+            uncensored = 1
+    else:
+        return
+    if not all(os.path.isfile(f) for f in (full_fanart_path, full_thumb_path)):
+        return
+
+    cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
+    if conf.is_watermark():
+        add_mark(full_poster_path, full_thumb_path, cn_sub, leak, uncensored, hack)
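
core_main_no_net_op re-derives every suffix flag from the filename alone, using the same two patterns core_main uses below. A few standalone checks of those patterns (illustrative filenames):

    import re

    cd_part = re.compile(r'[-_]CD\d+', re.I)
    cn_subs = re.compile(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', re.I)

    assert cd_part.search('ABC-123-CD2.mp4').group().upper() == '-CD2'
    assert cn_subs.search('ABC-123-C.mp4')        # '-C.' before the extension marks Chinese subs
    assert cn_subs.search('rctd-460ch.mp4')       # the new 'ch' hard-sub spelling
    assert not cn_subs.search('ABC-123-CD2.mp4')  # '-CD2' alone is not mistaken for '-C'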
+
+
+def core_main(movie_path, number_th, oCC):
     conf = config.getInstance()
     # ======================================================================= initialize required variables
     multi_part = 0
@@ -597,8 +696,6 @@ def core_main(file_path, number_th, oCC):
     hack = ''
     hack_word = ''
-    filepath = file_path  # absolute path of the movie
-
     # the commented-out variables below are not needed
     #rootpath= os.getcwd
     number = number_th
@@ -606,7 +703,7 @@ def core_main(file_path, number_th, oCC):
     # Return if blank dict returned (data not found)
     if not json_data:
-        moveFailedFolder(filepath)
+        moveFailedFolder(movie_path)
         return

     if json_data["number"] != number:
@@ -619,25 +716,26 @@ def core_main(file_path, number_th, oCC):
     imagecut = json_data.get('imagecut')
     tag = json_data.get('tag')
     # ======================================================================= detect -C / -CD suffixes
-    if '-CD' in filepath or '-cd' in filepath:
+    if re.search('[-_]CD\d+', movie_path, re.IGNORECASE):
         multi_part = 1
-        part = get_part(filepath)
-    if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
+        part = re.findall('[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper()
+    if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path,
+                 re.I) or '中文' in movie_path or '字幕' in movie_path:
         cn_sub = '1'
         c_word = '-C'  # suffix for titles with Chinese subtitles

     # determine whether the title is uncensored
-    uncensored = 1 if is_uncensored(number) else 0
+    unce = json_data.get('无码')
+    uncensored = int(unce) if isinstance(unce, bool) else int(is_uncensored(number))

-    if '流出' in filepath or 'uncensored' in filepath:
+    if '流出' in movie_path or 'uncensored' in movie_path.lower():
         liuchu = '流出'
         leak = 1
         leak_word = '-流出'  # suffix for leaked titles
     else:
         leak = 0

-    if 'hack'.upper() in str(filepath).upper() or '破解' in filepath:
+    if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path:
         hack = 1
         hack_word = "-hack"
@@ -666,78 +764,76 @@ def core_main(file_path, number_th, oCC):
         # check the small cover; when imagecut is 3, download the small cover
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
+            small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path)
         # creatFolder returns the numbered path
-        image_download( cover, fanart_path,thumb_path, path, filepath)
+        image_download( cover, fanart_path,thumb_path, path, movie_path)
         if not multi_part or part.lower() == '-cd1':
             try:
                 # download the trailer
                 if conf.is_trailer() and json_data.get('trailer'):
-                    trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, filepath)
+                    trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path)
             except:
                 pass
             try:
                 # download extrafanart: data, path, filepath
                 if conf.is_extrafanart() and json_data.get('extrafanart'):
-                    extrafanart_download(json_data.get('extrafanart'), path, number, filepath)
+                    extrafanart_download(json_data.get('extrafanart'), path, number, movie_path)
             except:
                 pass
         # crop the image
-        cutImage(imagecut, path , fanart_path, poster_path)
+        cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
         # add the watermark
         if conf.is_watermark():
             add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)

         # move the movie
-        paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word)
+        paste_file_to_folder(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word)

         # finally write the .nfo metadata file; creating the .nfo marks the task as successful
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word
                     ,fanart_path,poster_path,thumb_path)

     elif conf.main_mode() == 2:
         # create the folder
         path = create_folder(json_data)
         # move files
-        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, hack_word)
+        paste_file_to_folder_mode2(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word)
         if conf.is_watermark():
             add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)

     elif conf.main_mode() == 3:
-        path = str(Path(file_path).parent)
+        path = str(Path(movie_path).parent)
         if multi_part == 1:
             number += part  # number now carries the CD1 suffix

         # check the small cover; when imagecut is 3, download the small cover
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
+            small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path)
         # creatFolder returns the numbered path
-        image_download( cover, fanart_path,thumb_path, path, filepath)
+        image_download( cover, fanart_path, thumb_path, path, movie_path)
         if not multi_part or part.lower() == '-cd1':
             # download the trailer
             if conf.is_trailer() and json_data.get('trailer'):
-                trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, filepath)
+                trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path)
             # download extrafanart: data, path, filepath
             if conf.is_extrafanart() and json_data.get('extrafanart'):
-                extrafanart_download(json_data.get('extrafanart'), path, number, filepath)
+                extrafanart_download(json_data.get('extrafanart'), path, number, movie_path)
         # crop the image
-        cutImage(imagecut, path , fanart_path, poster_path)
+        cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
        # add the watermark
         if conf.is_watermark():
             add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)

         # finally write the .nfo metadata file; creating the .nfo marks the task as successful
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path,
                     tag, json_data.get('actor_list'), liuchu, uncensored, hack_word,fanart_path,poster_path,thumb_path)

View File

@@ -2,7 +2,7 @@
 main_mode=1
 failed_output_folder=data/failure_output
 success_output_folder=data/organized
-soft_link=0
+link_mode=0
 [proxy]
 proxy=

View File

@@ -5,8 +5,9 @@ import config
 import typing

 G_spat = re.compile(
-    "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|"
-    "^hhd800\.com@|-uncensored|_uncensored|-leak|_leak|-4K|_4K",
+    "^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|"
+    "^(fhd|hd|sd|1080p|720p|4K)(-|_)|"
+    "(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|leak)",
     re.IGNORECASE)
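
The rewritten pattern folds the hard-coded hhd800.com@ case into a generic site-prefix rule and adds more quality tags. A standalone check against one of the new test filenames below:

    import re

    G_spat = re.compile(
        "^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|"
        "^(fhd|hd|sd|1080p|720p|4K)(-|_)|"
        "(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|leak)",
        re.IGNORECASE)

    # strips the 'hhd800.com@' prefix and the '-HD' quality tag
    assert G_spat.sub("", "hhd800.com@STARS-566-HD.mp4") == "STARS-566.mp4"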
@@ -46,9 +47,13 @@ def get_number(debug: bool, file_path: str) -> str:
         lower_check = filename.lower()
         if 'fc2' in lower_check:
             filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-        filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)
+        filename = re.sub("[-_]cd\d{1,2}", "", filename, flags=re.IGNORECASE)
+        if not re.search("-|_", filename):  # no '-' left after stripping -CD1, e.g. n1012-CD1.wmv
+            return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
         file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
         file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
+        if re.search("\d+ch$", file_number, flags=re.I):
+            file_number = file_number[:-2]
         return file_number.upper()
     else:  # extract numbers without a hyphen, e.g. FANZA CID
         # western (EU/US) number matching rules
@@ -124,7 +129,8 @@ def is_uncensored(number):
     ):
         return True
     if G_cache_uncensored_conf.is_empty():
-        G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
+        if G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(',')) == None:
+            return False
     return G_cache_uncensored_conf.check(number)
@@ -146,13 +152,23 @@ if __name__ == "__main__":
         "caribean-020317_001.nfo",  # a '-' misnamed as '_'
         "257138_3xplanet_1Pondo_080521_001.mp4",
         "ADV-R0624-CD3.wmv",  # multi-disc title
         "XXX-AV 22061-CD5.iso",  # supports the studio format xxx-av-22061; naming rule from the javdb data source
         "xxx-av 20589.mp4",
         "Muramura-102114_145-HD.wmv",  # supports the studio format 102114_145; naming rule from the javdb data source
         "heydouga-4102-023-CD2.iso",  # supports the studio format heydouga-4102-023; naming rule from the javdb data source
         "HeyDOuGa4236-1048 Ai Qiu - .mp4",  # heydouga-4236-1048; naming rule from the javdb data source
         "pacopacomama-093021_539-FHD.mkv",  # supports the studio format 093021_539; naming rule from the javdb data source
-        "sbw99.cc@heyzo_hd_2636_full.mp4"
+        "sbw99.cc@heyzo_hd_2636_full.mp4",
+        "hhd800.com@STARS-566-HD.mp4",
+        "jav20s8.com@GIGL-677_4K.mp4",
+        "sbw99.cc@iesp-653-4K.mp4",
+        "4K-ABP-358_C.mkv",
+        "n1012-CD1.wmv",
+        "[]n1012-CD2.wmv",
+        "rctd-460ch.mp4",  # besides -C hard subs, 'ch' hard subs are now supported
+        "rctd-461CH-CD2.mp4",  # CDn may follow ch
+        "rctd-461-Cd3-C.mp4",  # -C may follow CDn
+        "rctd-461-C-cD4.mp4",  # cD1/Cd1/cd1/CD1 are all normalized to uppercase CD1 in the generated .nfo
     )

View File

@@ -9,7 +9,7 @@ mkdir build
 mkdir __pycache__

 pyinstaller --onefile Movie_Data_Capture.py `
-    --hidden-import "ImageProcessing.hog" `
+    --hidden-import "ImageProcessing.cnn" `
     --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" `
     --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
     --add-data "$OPENCC_PATH;opencc" `

View File

@@ -9,4 +9,4 @@ urllib3==1.24.3
 certifi==2020.12.5
 MechanicalSoup==1.1.0
 opencc-python-reimplemented
 face_recognition

View File

@@ -1,8 +1,10 @@
 pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448 py38-mechanicalsoup
 pip install pyquery pyinstaller
 pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --hidden-import "ImageProcessing.cnn" \
     --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
     --add-data "$(python3.8 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
+    --add-data "$(python3.8 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \

View File

@@ -13,8 +13,10 @@
 pip3 install -r requirements.txt
 pip3 install cloudscraper==1.2.52
 pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --hidden-import "ImageProcessing.cnn" \
     --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
     --add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
+    --add-data "$(python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \