Merge branch 'upstream'

# Conflicts:
#	WebCrawler/fanza.py
Author: Deng Zhou
Date: 2022-04-29 23:53:21 +08:00
34 changed files with 25968 additions and 18336 deletions


@@ -39,7 +39,7 @@ jobs:
run: |
pyinstaller \
--onefile Movie_Data_Capture.py \
--hidden-import "ImageProcessing.hog" \
--hidden-import "ImageProcessing.cnn" \
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
--add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
@@ -51,7 +51,7 @@ jobs:
run: |
pyinstaller `
--onefile Movie_Data_Capture.py `
--hidden-import "ImageProcessing.hog" `
--hidden-import "ImageProcessing.cnn" `
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
--add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
--add-data "$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1);face_recognition_models" `


@@ -18,6 +18,7 @@ from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from cloudscraper import create_scraper
from concurrent.futures import ThreadPoolExecutor
from unicodedata import category
def getXpathSingle(htmlcode, xpath):
@@ -26,7 +27,7 @@ def getXpathSingle(htmlcode, xpath):
return result1
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
@@ -69,7 +70,6 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
print('[-]Connect Failed! Please check your Proxy or Network!')
raise Exception('Connect Failed')
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
configProxy = config.getInstance().proxy()
errors = ""
@@ -381,7 +381,7 @@ def load_cookies(cookie_json_filename: str):
break
if not cookies_filename:
return None, None
return json.load(open(cookies_filename)), cookies_filename
return json.loads(Path(cookies_filename).read_text(encoding='utf-8')), cookies_filename
except:
return None, None
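The load_cookies change above swaps json.load(open(...)) for Path.read_text(encoding='utf-8'), which closes the file deterministically and pins the encoding (a bare open() uses the locale default, which can fail on non-UTF-8 Windows). A minimal before/after sketch with a hypothetical path:

import json
from pathlib import Path

cookies_filename = 'cookies.json'  # hypothetical path for illustration
# before: handle never closed, encoding follows the locale
# cookies = json.load(open(cookies_filename))
# after: deterministic close, explicit UTF-8
cookies = json.loads(Path(cookies_filename).read_text(encoding='utf-8'))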
@@ -466,7 +466,7 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
os.makedirs(path)
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
os._exit(0)
proxies = configProxy.proxies()
headers = {
'User-Agent': G_USER_AGENT}
@@ -483,7 +483,7 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
os.makedirs(path)
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
os._exit(0)
headers = {
'User-Agent': G_USER_AGENT}
r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -519,14 +519,13 @@ def download_one_file(args) -> str:
wrapped for map function
"""
def _inner(url: str, save_path: Path):
filebytes = get_html(url, return_type='content')
if isinstance(filebytes, bytes) and len(filebytes):
if len(filebytes) == save_path.open('wb').write(filebytes):
(url, save_path) = args
filebytes = get_html(url, return_type='content')
if isinstance(filebytes, bytes) and len(filebytes):
with save_path.open('wb') as fpbyte:
if len(filebytes) == fpbyte.write(filebytes):
return str(save_path)
return _inner(*args)
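download_one_file takes its two arguments packed in a single tuple precisely so it can be fed to map()-style APIs, and the rewrite above moves the write into a context manager instead of a bare open(...).write(...). An illustrative usage with hypothetical URLs:

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from ADC_function import download_one_file

tasks = [('https://example.com/a.jpg', Path('a.jpg')),   # hypothetical URLs
         ('https://example.com/b.jpg', Path('b.jpg'))]
with ThreadPoolExecutor(max_workers=2) as pool:
    # each tuple arrives packed, matching download_one_file(args)
    done = [p for p in pool.map(download_one_file, tasks) if p]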
def parallel_download_files(dn_list: typing.Iterable[typing.Sequence], parallel: int = 0):
"""
@@ -567,6 +566,7 @@ def delete_all_elements_in_list(string: str, lists: typing.Iterable[str]):
new_lists.append(i)
return new_lists
def delete_all_elements_in_str(string_delete: str, string: str):
"""
delete same string in given list
@@ -574,4 +574,9 @@ def delete_all_elements_in_str(string_delete: str, string: str):
for i in string:
if i == string_delete:
string = string.replace(i,"")
return string
return string
# width calculation for print-format alignment when the content contains CJK characters
def cnspace(v: str, n: int) -> int:
return n - [category(c) for c in v].count('Lo')
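cnspace shrinks a format width by one for every character whose Unicode category is 'Lo' (other letter, which covers CJK), since those glyphs occupy two terminal columns. A quick check of the alignment it produces, assuming cnspace as defined above:

for name in ('abc', '中文ab'):
    # both rows align the trailing '|' at the same terminal column
    print('{0:{1}}|'.format(name, cnspace(name, 10)))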


@@ -1,12 +1,18 @@
import sys
sys.path.append('../')
import logging
import os
import config
import importlib
from pathlib import Path
from PIL import Image
import shutil
from ADC_function import file_not_exist_or_empty
def face_crop_width(filename, width, height):
aspect_ratio = config.getInstance().face_aspect_ratio()
# the new width is 2/3 of the height
cropWidthHalf = int(height/3)
try:
@@ -21,15 +27,15 @@ def face_crop_width(filename, width, height):
# clamp to the image bounds
if cropLeft < 0:
cropLeft = 0
cropRight = cropWidthHalf*2
cropRight = cropWidthHalf * aspect_ratio
elif cropRight > width:
cropLeft = width-cropWidthHalf*2
cropLeft = width - cropWidthHalf * aspect_ratio
cropRight = width
return (cropLeft, 0, cropRight, height)
except:
print('[-]Not found face! ' + filename)
# default: crop from the right side
return (width-cropWidthHalf*2, 0, width, height)
return (width-cropWidthHalf * aspect_ratio, 0, width, height)
def face_crop_height(filename, width, height):
@@ -54,29 +60,43 @@ def face_crop_height(filename, width, height):
return (0, 0, width, cropHeight)
def cutImage(imagecut, path, fanart_path, poster_path):
def cutImage(imagecut, path, fanart_path, poster_path, skip_facerec=False):
conf = config.getInstance()
fullpath_fanart = os.path.join(path, fanart_path)
fullpath_poster = os.path.join(path, poster_path)
if imagecut == 1: # crop the large cover
aspect_ratio = conf.face_aspect_ratio()
if conf.face_aways_imagecut():
imagecut = 1
elif conf.download_only_missing_images() and not file_not_exist_or_empty(fullpath_poster):
return
# imagecut == 4 also marks a censored movie; crop its cover with face recognition too
if imagecut == 1 or imagecut == 4: # crop the large cover
try:
img = Image.open(fullpath_fanart)
width, height = img.size
if width/height > 2/3: # wider than 2:3
# crop centered on the face
img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
if imagecut == 4:
# crop centered on the face
img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
elif skip_facerec:
# censored covers default to a right-side crop
img2 = img.crop((width - int(height / 3) * aspect_ratio, 0, width, height))
else:
# crop centered on the face
img2 = img.crop(face_crop_width(fullpath_fanart, width, height))
elif width/height < 2/3: # taller than 2:3
# crop upward from the bottom
img2 = img.crop(face_crop_height(fullpath_fanart, width, height))
else: # exactly 2:3
img2 = img
img2.save(fullpath_poster)
print('[+]Image Cutted! ' + fullpath_poster)
print(f"[+]Image Cutted! {Path(fullpath_poster).name}")
except Exception as e:
print(e)
print('[-]Cover cut failed!')
elif imagecut == 0: # copy the cover unchanged
shutil.copyfile(fullpath_fanart, fullpath_poster)
print('[+]Image Copyed! ' + fullpath_poster)
print(f"[+]Image Copyed! {Path(fullpath_poster).name}")
def face_center(filename, model):
@@ -91,5 +111,5 @@ def face_center(filename, model):
return (0, 0)
if __name__ == '__main__':
cutImage(1,'H:\\test\\','12.jpg','test.jpg')
cutImage(1,'z:/t/','p.jpg','o.jpg')
#cutImage(1,'H:\\test\\','12.jpg','test.jpg')
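With face_aspect_ratio in play, the fallback right-side crop used above reduces to a small piece of arithmetic; a standalone sketch (hypothetical helper, not project code) of the same box:

def right_crop_box(width: int, height: int, aspect_ratio: int = 2):
    # poster width = height/3 * aspect_ratio; the default 2 gives the usual 2:3 poster
    crop_w = int(height / 3) * aspect_ratio
    return (max(width - crop_w, 0), 0, width, height)

print(right_crop_box(1800, 1200))  # -> (1000, 0, 1800, 1200)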


@@ -1,4 +1,8 @@
import hog
import sys
sys.path.append('../')
from ImageProcessing.hog import face_center as hog_face_center
def face_center(filename, model):
return hog.face_center(filename, model)
return hog_face_center(filename, model)


@@ -7,18 +7,20 @@ SHELL = /bin/bash
.DEFAULT: make
make:
#@echo "[+]make prepare-dev"
#sudo apt-get -y install python3.7 python3-pip
#pip3 install -r requirements.txt
#pip3 install pyinstaller
@echo "[+]make prepare-dev"
#sudo apt-get -y install python3 python3-pip
pip3 install -r requirements.txt
pip3 install pyinstaller
#@echo "[+]Set CLOUDSCRAPER_PATH variable"
#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
@echo "[+]Pyinstaller make"
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--hidden-import "ImageProcessing.cnn" \
--add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \
--add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \
--add-data "`python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1`:face_recognition_models" \
--add-data "Img:Img" \
--add-data "config.ini:." \

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- 说明:可使用文本编辑器打开本文件后自行编辑。
keyword用于匹配标签/导演/系列/制作/发行的关键词,每个名字前后都需要用逗号隔开。当其中包含刮削得到的关键词时,可以输出对应语言的词。
zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。当输出词为“删除”时表示遇到该关键词时在对应内容中删除该关键词-->
@@ -575,7 +575,7 @@ zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。
<a zh_cn="一本道" zh_tw="一本道" jp="一本道" keyword=",一本道,"/>
<a zh_cn="加勒比" zh_tw="加勒比" jp="加勒比" keyword=",加勒比,カリビアンコム,"/>
<a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,サディスティックヴィレッジ,"/>
<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,"/>
<a zh_cn="PRESTIGE" zh_tw="PRESTIGE" jp="PRESTIGE" keyword=",PRESTIGE,プレステージ,"/>
<a zh_cn="MOODYZ" zh_tw="MOODYZ" jp="MOODYZ" keyword=",MOODYZ,ムーディーズ,"/>
<a zh_cn="ROCKET" zh_tw="ROCKET" jp="ROCKET" keyword=",ROCKET,"/>
@@ -600,28 +600,5 @@ zh_cn/zh_tw/jp指对应语言输出的词按设置的对应语言输出。
<a zh_cn="WANZ" zh_tw="WANZ" jp="WANZ" keyword=",WANZ,ワンズファクトリー,"/>
<a zh_cn="BeFree" zh_tw="BeFree" jp="BeFree" keyword=",BeFree,"/>
<a zh_cn="MAX-A" zh_tw="MAX-A" jp="MAX-A" keyword=",MAX-A,マックスエー,"/>
<!-- 2021-11-8 Update -->
<a zh_cn="Energy" zh_tw="Energy" jp="アイエナジー" keyword=",アイエナジー,"/>
<a zh_cn="Idea Pocket" zh_tw="Idea Pocket" jp="アイデアポケット" keyword=",アイデアポケット,"/>
<a zh_cn="AKNR" zh_tw="AKNR" jp="アキノリ" keyword=",アキノリ,"/>
<a zh_cn="Attackers" zh_tw="Attackers" jp="アタッカーズ" keyword=",アタッカーズ,"/>
<a zh_cn="Alice Japan" zh_tw="Alice Japan" jp="アリスJAPAN" keyword=",アリスJAPAN,"/>
<a zh_cn="Aurora Project Annex" zh_tw="Aurora Project Annex" jp="オーロラプロジェクト・アネックス" keyword=",オーロラプロジェクト・アネックス,"/>
<a zh_cn="Crystal 映像" zh_tw="Crystal 映像" jp="クリスタル映像" keyword=",クリスタル映像,"/>
<a zh_cn="Glory Quest" zh_tw="Glory Quest" jp="グローリークエスト" keyword=",グローリークエスト,"/>
<a zh_cn="DAS" zh_tw="DAS" jp="ダスッ!" keyword=",ダスッ!,"/>
<a zh_cn="DEEPs" zh_tw="DEEPs" jp="ディープス" keyword=",ディープス,"/>
<a zh_cn="Dogma" zh_tw="Dogma" jp="ドグマ" keyword=",ドグマ,"/>
<a zh_cn="宇宙企画" zh_tw="宇宙企画" jp="メディアステーション" keyword=",メディアステーション,"/>
<a zh_cn="WANZ FACTORY" zh_tw="WANZ FACTORY" jp="ワンズファクトリー" keyword=",ワンズファクトリー,"/>
<a zh_cn="VR PRODUCE" zh_tw="VR PRODUCE" jp="VRプロダクツ" keyword=",VRプロダクツ,VRPRODUCE,"/>
<a zh_cn="Real Works" zh_tw="Real Works" jp="レアルワークス" keyword=",レアルワークス,"/>
<a zh_cn="MAX-A" zh_tw="MAX-A" jp="マックスエー" keyword=",マックスエー,"/>
<a zh_cn="PETERS MAX" zh_tw="PETERS MAX" jp="ピーターズMAX" keyword=",ピーターズMAX,"/>
<a zh_cn="NATURAL HIGH" zh_tw="NATURAL HIGH" jp="ナチュラルハイ" keyword=",ナチュラルハイ,"/>
<a zh_cn="MAXING" zh_tw="MAXING" jp="マキシング" keyword=",マキシング,"/>
<a zh_cn="Ms Video Group" zh_tw="Ms Video Group" jp="エムズビデオグループ" keyword=",エムズビデオグループ,"/>
<a zh_cn="Minimum" zh_tw="Minimum" jp="ミニマム" keyword=",ミニマム,"/>
<a zh_cn="WAAP Entertainment" zh_tw="WAAP Entertainment" jp="ワープエンタテインメント" keyword=",ワープエンタテインメント,"/>
<a zh_cn="pacopacomama" zh_tw="pacopacomama" jp="パコパコママ" keyword=",pacopacomama,パコパコママ,"/>
</info>


@@ -18,7 +18,7 @@ from opencc import OpenCC
import config
from ADC_function import file_modification_days, get_html, parallel_download_files
from number_parser import get_number
from core import core_main, moveFailedFolder
from core import core_main, core_main_no_net_op, moveFailedFolder
def check_update(local_version):
@@ -40,7 +40,7 @@ def check_update(local_version):
print("[*]======================================================")
def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool, bool]:
conf = config.getInstance()
parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
@@ -49,6 +49,8 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
# parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
parser.add_argument("-L", "--link-mode", default='', nargs='?',
help="Create movie file link. 0:moving movie file, do not create link 1:soft link 2:try hard link first")
default_logdir = str(Path.home() / '.mlogs')
parser.add_argument("-o", "--log-dir", dest='logdir', default=default_logdir, nargs='?',
help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
@@ -60,12 +62,22 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool]:
help="Override nfo_skip_days value in config.")
parser.add_argument("-c", "--stop-counter", dest='cnt', default='', nargs='?',
help="Override stop_counter value in config.")
parser.add_argument("-R", "--rerun-delay", dest='delaytm', default='', nargs='?',
help="Delay (eg. 1h10m30s or 60 (second)) time and rerun, until all movies proceed. Note: stop_counter value in config or -c must none zero.")
parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
parser.add_argument("-a", "--auto-exit", action="store_true",
help="Auto exit after program complete")
parser.add_argument("-g", "--debug", action="store_true",
help="Turn on debug mode to generate diagnostic log for issue report.")
parser.add_argument("-N", "--no-network-operation", action="store_true",
help="No network query, do not get metadata, for cover cropping purposes, only takes effect when main mode is 3.")
parser.add_argument("-w", "--website", dest='site', default='', nargs='?',
help="Override [priority]website= in config.")
parser.add_argument("-D", "--download-images", dest='dnimg', action="store_true",
help="Override [common]download_only_missing_images=0 force invoke image downloading.")
parser.add_argument("-C", "--config-override", dest='cfgcmd', default='', nargs='?',
help="Common use config override. grammar: section:key=value[;[section:]key=value] eg. 'de:s=1' or 'debug_mode:switch=1' override[debug_mode]switch=1")
parser.add_argument("-z", "--zero-operation", dest='zero_op', action="store_true",
help="""Only show job list of files and numbers, and **NO** actual operation
is performed. It may help you correct wrong numbers before real job.""")
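Per the -C help text above, overrides chain with ';' and the section prefix may be abbreviated or omitted after the first entry; hedged examples, inferred from that help text, of strings conf.set_override accepts:

conf.set_override('debug_mode:switch=1')                  # full form
conf.set_override('de:s=1')                               # abbreviated, same override (the help's own example)
conf.set_override('common:main_mode=3;nfo_skip_days=30')  # ';' chains overrides; later section is optional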
@@ -73,24 +85,40 @@ is performed. It may help you correct wrong numbers before real job.""")
args = parser.parse_args()
def get_natural_number_or_none(value):
return int(value) if isinstance(value, str) and value.isnumeric() and int(value) >= 0 else None
def set_natural_number_or_none(sk, value):
if isinstance(value, str) and value.isnumeric() and int(value) >= 0:
conf.set_override(f'{sk}={value}')
def get_str_or_none(value):
return value if isinstance(value, str) and len(value) else None
def set_str_or_none(sk, value):
if isinstance(value, str) and len(value):
conf.set_override(f'{sk}={value}')
def get_bool_or_none(value):
return True if isinstance(value, bool) and value else None
def set_bool_or_none(sk, value):
if isinstance(value, bool) and value:
conf.set_override(f'{sk}=1')
config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
set_natural_number_or_none("common:main_mode", args.main_mode)
set_natural_number_or_none("common:link_mode", args.link_mode)
set_str_or_none("common:source_folder", args.path)
set_bool_or_none("common:auto_exit", args.auto_exit)
set_natural_number_or_none("common:nfo_skip_days", args.days)
set_natural_number_or_none("common:stop_counter", args.cnt)
set_bool_or_none("common:ignore_failed_list", args.ignore_failed_list)
set_str_or_none("common:rerun_delay", args.delaytm)
set_str_or_none("priority:website", args.site)
if isinstance(args.dnimg, bool) and args.dnimg:
conf.set_override("common:download_only_missing_images=0")
set_bool_or_none("debug_mode:switch", args.debug)
if isinstance(args.cfgcmd, str) and len(args.cfgcmd.strip()):
conf.set_override(args.cfgcmd.strip())
return args.file, args.number, args.logdir, args.regexstr, args.zero_op
no_net_op = False
if conf.main_mode() == 3:
no_net_op = args.no_network_operation
if no_net_op:
conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1")
return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op
class OutLogger(object):
@@ -113,9 +141,12 @@ class OutLogger(object):
self.log.write(msg)
def flush(self):
self.term.flush()
self.log.flush()
os.fsync(self.log.fileno())
if 'flush' in dir(self.term):
self.term.flush()
if 'flush' in dir(self.log):
self.log.flush()
if 'fileno' in dir(self.log):
os.fsync(self.log.fileno())
def close(self):
if self.term is not None:
@@ -244,39 +275,42 @@ def close_logfile(logdir: str):
except:
pass
# step 3: merge monthly logs into the yearly log
if today.month < 4:
return
mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)]
if not mons or not len(mons):
return
mons.sort()
deadline_year = f'mdc_{today.year - 1}13'
year_merge = [f for f in mons if f.stem < deadline_year]
if not year_merge or not len(year_merge):
return
toyear = len('12.txt') # cut length mdc_2020|12.txt
for f in year_merge:
try:
year_file_name = str(f)[:-toyear] + '.txt' # mdc_2020.txt
with open(year_file_name, 'a', encoding='utf-8') as y:
y.write(f.read_text(encoding='utf-8'))
f.unlink(missing_ok=True)
except:
pass
for i in range(1):
if today.month < 4:
break
mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^mdc_\d{6}$', f.stem, re.A)]
if not mons or not len(mons):
break
mons.sort()
deadline_year = f'mdc_{today.year - 1}13'
year_merge = [f for f in mons if f.stem < deadline_year]
if not year_merge or not len(year_merge):
break
toyear = len('12.txt') # cut length mdc_2020|12.txt
for f in year_merge:
try:
year_file_name = str(f)[:-toyear] + '.txt' # mdc_2020.txt
with open(year_file_name, 'a', encoding='utf-8') as y:
y.write(f.read_text(encoding='utf-8'))
f.unlink(missing_ok=True)
except:
pass
# step 4: compressing the yearly logs. If you need compression, do it by hand or with an external
# scheduled script. GNU lzip is recommended: for text logs of this granularity it currently has the
# best compression ratio; with lzip -9 the ratio beats xz -9 while using less memory, utilizing
# multiple cores better (plzip, the multithreaded build) and decompressing faster. Compressed size
# is roughly 2.4%-3.7% of the original; a 100MB log shrinks to about 3.7MB.
return filepath
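The month-to-year merge above now sits inside `for i in range(1):` so each early return becomes a break and control still reaches the final `return filepath`. The pattern in miniature (illustrative only, not project code):

def one_pass_block(ready: bool) -> str:
    for _ in range(1):      # single-pass block: 'break' jumps past it
        if not ready:
            break           # formerly an early 'return'
        print('merge logs')
    return 'filepath'       # now reached on every path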
def signal_handler(*args):
print('[!]Ctrl+C detected, Exit.')
sys.exit(9)
os._exit(9)
def sigdebug_handler(*args):
config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF'))
conf = config.getInstance()
conf.set_override(f"debug_mode:switch={int(not conf.debug())}")
print(f"[!]Debug {('oFF', 'On')[int(conf.debug())]}")
# Added: skip files on the failed list; skip by .nfo modification age with a count of skipped videos; in debug mode (-g) list each skipped file; skip small ad clips
@@ -285,7 +319,7 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
main_mode = conf.main_mode()
debug = conf.debug()
nfo_skip_days = conf.nfo_skip_days()
soft_link = conf.soft_link()
link_mode = conf.link_mode()
file_type = conf.media_type().lower().split(",")
trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
cliRE = None
@@ -296,7 +330,7 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
pass
failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
failed_set = set()
if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
if (main_mode == 3 or link_mode) and not conf.ignore_failed_list():
try:
flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
failed_set = set(flist)
@@ -327,20 +361,24 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
print('[!]Skip failed movie:', absf)
continue
is_sym = full_name.is_symlink()
if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # short-circuit boolean: don't stat() symlinks, the target may not exist
continue # file is symlink or hardlink (Linux/NTFS/Darwin)
if main_mode != 3 and (is_sym or (full_name.stat().st_nlink > 1 and not conf.scan_hardlink())): # short-circuit boolean: don't stat() symlinks, the target may not exist
continue # outside mode 3, skip symlinks, and skip hardlinks unless hardlink scanning is configured
# let 0-byte debug samples through; drop ads smaller than 120MB, e.g. '苍老师强力推荐.mp4'(102.2MB)'黑道总裁.mp4'(98.4MB)'有趣的妹子激情表演.MP4'(95MB)'有趣的臺灣妹妹直播.mp4'(15.1MB)
movie_size = 0 if is_sym else full_name.stat().st_size # as above, don't stat() symlinks; st_size of 0 skips the small-video check
if 0 < movie_size < 125829120: # 1024*1024*120=125829120
continue
if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
continue
if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(
full_name.with_suffix('.nfo')) <= nfo_skip_days:
skip_nfo_days_cnt += 1
if debug:
print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
continue
if main_mode == 3:
nfo = full_name.with_suffix('.nfo')
if not nfo.is_file():
if debug:
print(f"[!]Metadata {nfo.name} not found for '{absf}'")
elif nfo_skip_days > 0 and file_modification_days(nfo) <= nfo_skip_days:
skip_nfo_days_cnt += 1
if debug:
print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
continue
total.append(absf)
if skip_failed_cnt:
@@ -348,13 +386,13 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
if skip_nfo_days_cnt:
print(
f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
if nfo_skip_days <= 0 or not link_mode or main_mode == 3:
return total
# in link mode, already-scraped movies in the success folder also need their .nfo age checked; skip those updated within N days
skip_numbers = set()
success_folder = Path(conf.success_folder()).resolve()
for f in success_folder.glob(r'**/*'):
if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
if not re.match(r'\.nfo$', f.suffix, re.IGNORECASE):
continue
if file_modification_days(f) > nfo_skip_days:
continue
@@ -388,7 +426,7 @@ def create_failed_folder(failed_folder: str):
os.makedirs(failed_folder)
except:
print(f"[-]Fatal error! Can not make folder '{failed_folder}'")
sys.exit(0)
os._exit(0)
def rm_empty_folder(path):
@@ -405,38 +443,44 @@ def rm_empty_folder(path):
pass
def create_data_and_move(file_path: str, zero_op, oCC):
def create_data_and_move(movie_path: str, zero_op: bool, no_net_op: bool, oCC):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
debug = config.getInstance().debug()
n_number = get_number(debug, os.path.basename(file_path))
file_path = os.path.abspath(file_path)
n_number = get_number(debug, os.path.basename(movie_path))
movie_path = os.path.abspath(movie_path)
if debug is True:
print(f"[!] [{n_number}] As Number making data for '{file_path}'")
print(f"[!] [{n_number}] As Number making data for '{movie_path}'")
if zero_op:
return
if n_number:
core_main(file_path, n_number, oCC)
if no_net_op:
core_main_no_net_op(movie_path, n_number)
else:
core_main(movie_path, n_number, oCC)
else:
print("[-] number empty ERROR")
moveFailedFolder(file_path)
moveFailedFolder(movie_path)
print("[*]======================================================")
else:
try:
print(f"[!] [{n_number}] As Number making data for '{file_path}'")
print(f"[!] [{n_number}] As Number making data for '{movie_path}'")
if zero_op:
return
if n_number:
core_main(file_path, n_number, oCC)
if no_net_op:
core_main_no_net_op(movie_path, n_number)
else:
core_main(movie_path, n_number, oCC)
else:
raise ValueError("number empty")
print("[*]======================================================")
except Exception as err:
print(f"[-] [{file_path}] ERROR:")
print(f"[-] [{movie_path}] ERROR:")
print('[-]', err)
try:
moveFailedFolder(file_path)
moveFailedFolder(movie_path)
except Exception as err:
print('[!]', err)
@@ -455,7 +499,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
print("[-] [{}] ERROR:".format(file_path))
print('[-]', err)
if conf.soft_link():
if conf.link_mode():
print("[-]Link {} to failed folder".format(file_path))
os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
else:
@@ -466,23 +510,14 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
print('[!]', err)
def main():
version = '6.0.2'
urllib3.disable_warnings() # Ignore http proxy warning
# Read config.ini first, in argparse_function() need conf.failed_folder()
conf = config.Config("config.ini")
# Parse command line args
single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
def main(args: tuple) -> Path:
(single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op) = args
conf = config.getInstance()
main_mode = conf.main_mode()
folder_path = ""
if main_mode not in (1, 2, 3):
print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
sys.exit(4)
os._exit(4)
signal.signal(signal.SIGINT, signal_handler)
if sys.platform == 'win32':
@@ -508,8 +543,8 @@ def main():
print(f"[+]Load Config file '{conf.ini_path}'.")
if conf.debug():
print('[+]Enable debug')
if conf.soft_link():
print('[!]Enable soft link')
if conf.link_mode() in (1, 2):
print('[!]Enable {} link'.format(('soft', 'hard')[conf.link_mode() - 1]))
if len(sys.argv) > 1:
print('[!]CmdLine:', " ".join(sys.argv[1:]))
print('[+]Main Working mode ## {}: {} ## {}{}{}'
@@ -521,7 +556,10 @@ def main():
)
if conf.update_check():
check_update(version)
try:
check_update(version)
except Exception as e:
print('[-]Update check failed!',e)
create_failed_folder(conf.failed_folder())
@@ -539,15 +577,21 @@ def main():
if file_modification_days(str(v)) >= conf.mapping_table_validity():
print("[+]Mapping Table Out of date! Remove", str(v))
os.remove(str(v))
res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists()))
for i, fp in enumerate(res, start=1):
if fp and len(fp):
print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
else:
print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
print("[-] --- AUTO EXIT AFTER 30s !!! --- ")
time.sleep(30)
os._exit(-1)
try:
res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists()))
for i, fp in enumerate(res, start=1):
if fp and len(fp):
print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
else:
print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
except Exception as e:
print("[!] ==================== ERROR ====================")
print("[!] " + "Mapping Table Download FAILED".center(47))
print("[!] " + "无法连接github".center(47))
print("[!] " + "请过几小时再试试".center(47))
print("[-] " + "------ AUTO EXIT AFTER 30s !!! ------ ".center(47))
time.sleep(30)
os._exit(-1)
# create OpenCC converter
ccm = conf.cc_convert_mode()
@@ -587,7 +631,7 @@ def main():
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -',
time.strftime("%H:%M:%S")))
create_data_and_move(movie_path, zero_op, oCC)
create_data_and_move(movie_path, zero_op, no_net_op, oCC)
if count >= stop_count:
print("[!]Stop counter triggered!")
break
@@ -605,14 +649,68 @@ def main():
print("[+]All finished!!!")
close_logfile(logdir)
return close_logfile(logdir)
if not conf.auto_exit():
input("Press enter key exit, you can check the error message before you exit...")
sys.exit(0)
def 分析日志文件(logfile):
try:
if not (isinstance(logfile, Path) and logfile.is_file()):
raise FileNotFoundError('log file not found')
logtxt = logfile.read_text(encoding='utf-8')
扫描电影数 = int(re.findall(r'\[\+]Find (.*) movies\.', logtxt)[0])
已处理 = int(re.findall(r'\[1/(.*?)] -', logtxt)[0])
完成数 = logtxt.count(r'[+]Wrote!')
return 扫描电影数, 已处理, 完成数
except:
return None, None, None
def period(delta, pattern):
d = {'d': delta.days}
d['h'], rem = divmod(delta.seconds, 3600)
d['m'], d['s'] = divmod(rem, 60)
return pattern.format(**d)
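period() just splits a timedelta into day/hour/minute/second fields for str.format, as the rerun loop below uses it; for instance:

from datetime import timedelta

# assuming period() as defined above
print(period(timedelta(seconds=3725), '{h}:{m:02}:{s:02}'))                  # -> 1:02:05
print(period(timedelta(days=2, seconds=90), '{d} days {h}:{m:02}:{s:02}'))  # -> 2 days 0:01:30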
if __name__ == '__main__':
multiprocessing.freeze_support()
main()
version = '6.1.1'
urllib3.disable_warnings() # Ignore http proxy warning
app_start = time.time()
# Read config.ini first, in argparse_function() need conf.failed_folder()
conf = config.Config("config.ini")
# Parse command line args
args = tuple(argparse_function(version))
再运行延迟 = conf.rerun_delay()
if 再运行延迟 > 0 and conf.stop_counter() > 0:
while True:
try:
logfile = main(args)
(扫描电影数, 已处理, 完成数) = 分析结果元组 = tuple(分析日志文件(logfile))
if all(isinstance(v, int) for v in 分析结果元组):
剩余个数 = 扫描电影数 - 已处理
总用时 = timedelta(seconds = time.time() - app_start)
print(f'All movies:{扫描电影数} processed:{已处理} successes:{完成数} remain:{剩余个数}' +
' Elapsed time {}'.format(
period(总用时, "{d} day {h}:{m:02}:{s:02}") if 总用时.days == 1
else period(总用时, "{d} days {h}:{m:02}:{s:02}") if 总用时.days > 1
else period(总用时, "{h}:{m:02}:{s:02}")))
if 剩余个数 == 0:
break
下次运行 = datetime.now() + timedelta(seconds=再运行延迟)
print(f'Next run time: {下次运行.strftime("%H:%M:%S")}, rerun_delay={再运行延迟}, press Ctrl+C stop run.')
time.sleep(再运行延迟)
else:
break
except:
break
else:
main(args)
if not conf.auto_exit():
if sys.platform == 'win32':
input("Press enter key exit, you can check the error message before you exit...")
sys.exit(0)


@@ -25,7 +25,7 @@ CLI 版本
# 文档
* [官方教程WIKI](https://github.com/yoshiko2/Movie_Data_Capture/wiki)
* [VergilGao's Docker部署](https://github.com/VergilGao/docker-avdc)
* [VergilGao's Docker部署](https://github.com/VergilGao/docker-mdc)
# 下载
* [Releases](https://github.com/yoshiko2/Movie_Data_Capture/releases/latest)
@@ -36,43 +36,40 @@ CLI 版本
# 申明
当你查阅、下载了本项目源代码或二进制程序,即代表你接受了以下条款
*软件仅供技术交流,学术交流使用
*项目和项目成果仅供技术,学术交流和Python3性能测试使用
* **请勿在墙内的社交平台上宣传此项目**
*软件作者编写出该软件旨在学习 Python ,提高编程水平
*软件不提供任何影片下载的线索
* 用户在使用本软件前,请用户了解并遵守当地法律法规,如果本软件使用过程中存在违反当地法律法规的行为,请勿使用该软件
* 用户在使用本软件时,若用户在当地产生一切违法行为由用户承担
* 严禁用户将本软件使用于商业和个人其他意图
*项目贡献者编写该项目旨在学习Python3 ,提高编程水平
*项目不提供任何影片下载的线索
* 用户在使用本项目和项目成果前,请用户了解并遵守当地法律法规,如果本项目及项目成果使用过程中存在违反当地法律法规的行为,请勿使用该项目及项目成果
* 用户在使用本项目和项目成果时,若用户在当地产生一切违法行为由用户承担
* 严禁用户将本项目和项目成果使用于商业和个人其他意图
* 源代码和二进制程序请在下载后24小时内删除
* 出售源码者的母亲会升天
* 本项目发起者yoshiko2保留最终决定权和最终解释权
* 若用户不同意上述条款任意一条,请勿使用本软件
* 用户使用本项目及项目成果所造成的一切后果由用户自行承担,贡献者概不负责
* 若用户不同意上述条款任意一条,请勿使用本项目和项目成果
---
When you view and download the source code or binary program of this project, it means that you have accepted the following terms
* This software is only for technical exchange and academic exchange
* This project is only for technical exchange, academic exchange and Python3 performance test
* **Please do not promote this project on popular social platforms**
* The software author wrote this software to learn Python and improve programming
* This software does not provide any clues for video download
* Before using this software, please understand and abide by local laws and regulations. If there is any violation of local laws and regulations during the use of this software, * please do not use this software
* When the user uses this software, if the user has any illegal acts in the local area, the user shall bear
* It is strictly forbidden for users to use this software for commercial and personal intentions
* The project contributors wrote this project to learn Python and improve their programming
* This project does not provide any clues for video download
* Before using this project and its results, please understand and abide by your local laws and regulations; if using them would violate those laws and regulations, do not use them
* Users bear sole responsibility for any illegal acts committed locally while using this project and its results
* It is strictly forbidden to use this project and its results for commercial or other personal intentions
* Please delete the source code and binary program within 24 hours after downloading
* The mother of the source seller will die
* The author of this software yoshiko2 reserves the right of final decision and final interpretation
* If the user does not agree with any of the above terms, please do not use this software
* All consequences of the user's use of this project and its results shall be borne by the user; the contributors accept no responsibility
* If the user does not agree with any of the above terms, please do not use this project or its results
---
本プロジェクトのソースコード、バイナリファイルをダウンロード、または表示するしたうえで、あなたは本規約に同意したものと見なします。
* このソフトウェアは、開発技術学習することのみに使用できます。
* このプロジェクトは、開発技術学習、Python3性能テストすることのみに使用できます。
* **ソーシャルメディアで本プロジェクトの宣伝をご遠慮ください**
* 者はPythonの勉強と技術力の向上のために、このソフトウェアを作成しました
*ソフトウェアは、あらゆる動画ダウンロード機能一切提供しません
*ソフトウェアを使用する前に、現地の法律規範をよく理解する必要があります。あなたは、適用される現地の法令を順守する責任を負います
*ソフトウェアを使用した結果生じた損害や法的責任につきまして作者は一切責任を負いません
*ソフトウェアを商用、業務、その他の営利目的のために使用することは一切禁止します。
* 貢献者はPythonの勉強と技術力の向上のために、このソフトウェアを作成しました
*プロジェクトは、あらゆる動画ダウンロード機能一切提供しません
*プロジェクトとプロジェクトの成果を使用する前に、現地の法律規範をよく理解する必要があります。あなたは、適用される現地の法令を順守する責任を負います
*プロジェクトとプロジェクトの成果を使用した結果生じた損害や法的責任につきまして作者は一切責任を負いません
*プロジェクトとプロジェクトの成果を商用、業務、その他の営利目的のために使用することは一切禁止します。
* 本プロジェクトのソースコード、バイナリファイルをダウンロードした場合、24時間以内に削除してください
* 元売り手の母親が天に召される
* 最終解釈権は作者yoshiko2に属します
* ユーザーによるこのプロジェクトの使用およびプロジェクトの結果によって引き起こされるすべての結果は、ユーザーが負担するものとし、寄稿者は責任を負わないものとします。
* 本規約およびすべての適用法、規約および規則を遵守する場合にのみ本ソフトウェアを使用することができます


@@ -24,6 +24,7 @@ from . import carib
from . import fc2club
from . import mv91
from . import madou
from . import gcolle
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
@@ -62,7 +63,8 @@ def get_data_from_json(file_number, oCC):
"carib": carib.main,
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main
"madou": madou.main,
"gcolle": gcolle.main,
}
conf = config.getInstance()
@@ -91,6 +93,8 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("fc2")))
if "fc2club" in sources:
sources.insert(0, sources.pop(sources.index("fc2club")))
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources.insert(0, sources.pop(sources.index("gcolle")))
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
@@ -100,6 +104,12 @@ def get_data_from_json(file_number, oCC):
sources.insert(0, sources.pop(sources.index("javdb")))
if "xcity" in sources:
sources.insert(0, sources.pop(sources.index("xcity")))
if "madou" in sources:
sources.insert(0, sources.pop(sources.index("madou")))
elif "madou" in sources and (
re.match(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources.insert(0, sources.pop(sources.index("madou")))
# check sources in func_mapping
todel = []
@@ -124,7 +134,10 @@ def get_data_from_json(file_number, oCC):
for source in sources:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
try:
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
except:
json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -136,7 +149,10 @@ def get_data_from_json(file_number, oCC):
try:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(func_mapping[source](file_number))
try:
json_data = json.loads(func_mapping[source](file_number))
except:
json_data = func_mapping[source](file_number)
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
@@ -242,8 +258,8 @@ def get_data_from_json(file_number, oCC):
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.load(
open(str(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json'), 'r', encoding="utf-8"))
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue


@@ -5,6 +5,7 @@ from lxml import etree
import json
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -17,95 +18,64 @@ def getActorPhoto(html):
p2 = {t: l}
d.update(p2)
return d
def getTitle(html):
try:
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.find('span').text)
return d
def getStudio(html):
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
def getRuntime(html):
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
def getLabel(html):
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
def getNum(html):
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
try:
result = str(re.search('\d{4}',release).group())
return result
except:
return release
def getRelease(html):
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
def getCover(html):
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x[2:]] if len(x) > 2 else []
def getSeries(html):
try:
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
return ''
def main(number):
html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
a = get_html(site + '/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
avsox_crawler2 = Crawler(a)
avsox_crawler = Crawler(detail)
try:
new_number = getNum(lx)
new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = getTitle(lx).strip(new_number)
title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
dic = {
'actor': getActor(lx),
'title': title,
'studio': getStudio(lx),
'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
'outline': getStoryline(number, title),
'runtime': getRuntime(lx),
'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
'director': '', #
'release': getRelease(lx),
'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
'number': new_number,
'cover': getCover(lx),
'cover_small': getCover_small(html),
'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
#'cover_small' : getCover_small(html),
'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
'imagecut': 3,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)),
'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': getSeries(lx),
'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
}
except Exception as e:
if config.getInstance().debug():


@@ -40,6 +40,7 @@ def main(number: str) -> json:
'website': f'{G_SITE}/moviepages/{number}/index.html',
'source': 'carib.py',
'series': get_series(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
@@ -59,7 +60,7 @@ def get_year(lx: html.HtmlElement) -> str:
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
g = getStoryline(number, title, 无码=True)
if len(g):
return g
return o

WebCrawler/crawler.py (new file, 28 lines)

@@ -0,0 +1,28 @@
from lxml import etree
class Crawler:
def __init__(self,htmlcode):
self.html = etree.HTML(htmlcode)
def getString(self,_xpath):
if _xpath == "":
return ""
result = self.html.xpath(_xpath)
try:
return result[0]
except:
return ""
def getStrings(self,_xpath):
result = self.html.xpath(_xpath)
try:
return result
except:
return ""
def getOutline(self,_xpath):
result = self.html.xpath(_xpath)
try:
return "\n".join(result)
except:
return ""


@@ -1,15 +1,14 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
#title //*[@id="work_name"]/a/text()
#print(get_html('https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html'))
#title /html/head/title/text()
#studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
#release //th[contains(text(),"販売日")]/../td/a/text()
#story //th[contains(text(),"シナリオ")]/../td/a/text()
@@ -18,14 +17,14 @@ from ADC_function import *
#jianjie //*[@id="main_inner"]/div[3]/text()
#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src
#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html
#https://www.dlsite.com/maniax/work/=/product_id/VJ013152.html
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath('//*[@id="work_name"]/a/text()')[0]
def getTitle(html):
result = str(html.xpath('/html/head/title/text()')[0])
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
return result
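The rewritten getTitle derives the work name from the <title> tag by chopping the trailing ' | DLsite' suffix and then the bracketed brand; on a hypothetical page title:

t = '作品タイトル [ブランド名] | DLsite'  # hypothetical <title> text
t = t[:t.rfind(' | DLsite')]
t = t[:t.rfind(' [')]
print(t)  # -> '作品タイトル'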
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except:
@@ -38,8 +37,7 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
p={i:''}
d.update(p)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getStudio(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -53,8 +51,7 @@ def getRuntime(a):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -69,12 +66,10 @@ def getYear(getRelease):
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRelease(html):
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
return result1.replace('年','-').replace('月','-').replace('日','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getTag(html):
try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result
@@ -96,26 +91,22 @@ def getCover_small(a, index=0):
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getCover(html):
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
return result.replace('.webp', '.jpg')
def getDirector(html):
try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except:
result = ''
return result
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getOutline(html):
total = []
result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getSeries(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
@@ -127,28 +118,28 @@ def getSeries(a):
def main(number):
try:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN',
cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'actor': getActor(html),
'title': getTitle(html),
'studio': getStudio(html),
'outline': getOutline(html),
'runtime': '',
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'director': getDirector(html),
'release': getRelease(html),
'number': number,
'cover': 'https:' + getCover(htmlcode),
'cover': 'https:' + getCover(html),
'cover_small': '',
'imagecut': 0,
'tag': getTag(htmlcode),
'label': getLabel(htmlcode),
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
'tag': getTag(html),
'label': getLabel(html),
'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(htmlcode),
'series': getSeries(html),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -166,4 +157,6 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('VJ013178'))
print(main('RJ329607'))


@@ -9,130 +9,33 @@ from urllib.parse import urlencode
from lxml import etree
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getTitle(text):
html = etree.fromstring(text, etree.HTMLParser())
result = html.xpath('//*[starts-with(@id, "title")]/text()')[0]
return result
def getFanzaStrings(self, string):
result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
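getFanzaString tries the label cell's sibling link text first, then its plain text; the str(...).strip(" ['']") idiom collapses an empty match list to ''. Against a minimal hypothetical table, assuming the fanzaCrawler class above:

snippet = "<table><tr><td>品番:</td><td>pred00251</td></tr></table>"
print(fanzaCrawler(snippet).getFanzaString('品番:'))  # -> 'pred00251'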
def getActor(text):
# //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(text, etree.HTMLParser())
result = (
str(
html.xpath(
"//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
)
)
.strip(" ['']")
.replace("', '", ",")
)
return result
def getRelease(fanza_Crawler):
result = fanza_Crawler.getFanzaString('発売日:')
if result == '----':
result = fanza_Crawler.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getStudio(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'メーカー')]/following-sibling::td/text()"
)[0]
return result
def getRuntime(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
return re.search(r"\d+", str(result)).group()
def getLabel(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'レーベル:')]/following-sibling::td/text()"
)[0]
return result
def getNum(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'品番:')]/following-sibling::td/text()"
)[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r"\d{4}", getRelease).group())
return result
except:
return getRelease
def getRelease(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'発売日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
result = "----"
if result == "----":
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/a/text()"
)[0].lstrip("\n")
except:
try:
result = html.xpath(
"//td[contains(text(),'配信開始日:')]/following-sibling::td/text()"
)[0].lstrip("\n")
except:
pass
return result.replace("/", "-")
def getTag(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
)
return result
except:
result = html.xpath(
"//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
)
return result
def getCover(text, number):
html = etree.fromstring(text, etree.HTMLParser())
def getCover(html, number):
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
@@ -151,29 +54,11 @@ def getCover(text, number):
return result
def getDirector(text):
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getOutline(html):
try:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'監督:')]/following-sibling::td/text()"
)[0]
return result
def getOutline(text):
html = etree.fromstring(text, etree.HTMLParser())
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
"\n", ""
)
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
except:
# (TODO) handle more edge case
# print(html)
@@ -181,23 +66,8 @@ def getOutline(text):
return result
def getSeries(text):
try:
html = etree.fromstring(text, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
)[0]
except:
result = html.xpath(
"//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
)[0]
return result
except:
return ""
def getExtrafanart(htmlcode): # 获取剧照
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div></div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
@@ -232,6 +102,7 @@ def main(number):
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls:
chosen_url = url + fanza_search_number
@@ -240,6 +111,7 @@ def main(number):
urlencode({"rurl": chosen_url})
)
)
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
@@ -249,28 +121,34 @@ def main(number):
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to following functions
fanza_hinban = getNum(htmlcode)
fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
data = {
"title": getTitle(htmlcode).strip(),
"studio": getStudio(htmlcode),
"outline": getOutline(htmlcode),
"runtime": getRuntime(htmlcode),
"director": getDirector(htmlcode) if "anime" not in chosen_url else "",
"actor": getActor(htmlcode) if "anime" not in chosen_url else "",
"release": getRelease(htmlcode),
"number": fanza_hinban,
"cover": getCover(htmlcode, fanza_hinban),
"title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(html),
"runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
"actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
"release": getRelease(fanza_Crawler),
"number": out_num,
"cover": getCover(html, fanza_hinban),
"imagecut": 1,
"tag": getTag(htmlcode),
"tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode),
"label": getLabel(htmlcode),
"year": getYear(
getRelease(htmlcode)
), # str(re.search('\d{4}',getRelease(a)).group()),
"label": fanza_Crawler.getFanzaString('レーベル'),
"year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
"series": getSeries(htmlcode),
"series": fanza_Crawler.getFanzaString('シリーズ:'),
}
except:
data = {
@@ -314,4 +192,6 @@ def main_htmlcode(number):
if __name__ == "__main__":
# print(main("DV-1562"))
# print(main("96fad1217"))
print(main("h_173ghmt68"))
print(main("pred00251"))
print(main("MIAA-391"))
print(main("OBA-326"))


@@ -4,58 +4,11 @@ import re
from lxml import etree#need install
import json
import ADC_function
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle_fc2com(htmlcode): # get title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/h3/text()')[0]
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0]
return result
except:
return ''
def getStudio_fc2com(htmlcode): # get studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode): # get product number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getRelease_fc2com(htmlcode2): # get release date
html=etree.fromstring(htmlcode2,etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()')).strip(" ['販売日 : ']").replace('/','-')
return result
def getCover_fc2com(htmlcode2): # get cover
html = etree.fromstring(htmlcode2, etree.HTMLParser())
result = str(html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[1]/span/img/@src')).strip(" ['']")
return 'http:' + result
# def getOutline_fc2com(htmlcode2): # get outline
# xpath_html = etree.fromstring(htmlcode2, etree.HTMLParser())
# path = str(xpath_html.xpath('//*[@id="top"]/div[1]/section[4]/iframe/@src')).strip(" ['']")
# html = etree.fromstring(ADC_function.get_html('https://adult.contents.fc2.com/'+path), etree.HTMLParser())
# print('https://adult.contents.fc2.com'+path)
# print(ADC_function.get_html('https://adult.contents.fc2.com'+path,cookies={'wei6H':'1'}))
# result = str(html.xpath('/html/body/div/text()')).strip(" ['']").replace("\\n",'',10000).replace("'",'',10000).replace(', ,','').strip(' ').replace('。,',',')
# return result
def getTag_fc2com(lx):
result = lx.xpath("//a[@class='tag tagTag']/text()")
return result
def getYear_fc2com(release):
try:
result = re.search('\d{4}',release).group()
return result
except:
return ''
def getExtrafanart(htmlcode): # fetch extrafanart stills
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode)
@@ -79,27 +32,30 @@ def getTrailer(htmlcode, number):
except:
return ''
else:
video_url = ''
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/')
actor = getActor_fc2com(htmlcode2)
if not actor:
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
fc2_crawler = Crawler(htmlcode2)
actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = {
'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': re.findall('\d{4}',release)[0],
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': getStudio_fc2com(htmlcode2),
'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor,
'release': getRelease_fc2com(htmlcode2),
'release': release,
'number': 'FC2-' + number,
'label': '',
'cover': cover,
@@ -107,7 +63,7 @@ def main(number):
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number),
'imagecut': 0,
'tag': getTag_fc2com(lx),
'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/',
@@ -121,6 +77,4 @@ def main(number):
return js
if __name__ == '__main__':
print(main('FC2-1787685'))
print(main('FC2-2086710'))
print(main('FC2-2182382'))

WebCrawler/gcolle.py (new file)

@@ -0,0 +1,88 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
from lxml import etree
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cookies, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cookies)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))
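
gcolle.py persists its session cookies so the R18 age gate only has to be passed once. A condensed sketch of the save half of that round-trip, mirroring the code above (the load half is the load_cookies() helper from ADC_function shown earlier in this diff):

import json
from pathlib import Path

def save_session_cookies(session, cookie_filename: str) -> None:
    # same save path as above: ~/.local/share/mdc/gcolle.json
    cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
    cookies_save.parent.mkdir(parents=True, exist_ok=True)
    cookies_save.write_text(
        json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4),
        encoding='utf-8')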


@@ -56,9 +56,9 @@ def parse_info(soup: BeautifulSoup) -> dict:
"label": get_label(data_dic),
"studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic),
"number": get_number(data_dic).upper(),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic),
"runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic),
}
else:


@@ -60,10 +60,10 @@ def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number, title): # fetch storyline; sites queried concurrently
def getOutline(number, title, uncensored): # fetch storyline; sites queried concurrently
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # calls from airav.py skip the outline and return immediately, avoiding duplicate scraping that slows processing
return getStoryline(number,title)
return getStoryline(number,title, 无码=uncensored)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
@@ -83,9 +83,13 @@ def getExtrafanart(htmlcode): # fetch extrafanart stills
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number)
w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser())
@@ -94,7 +98,7 @@ def main_uncensored(number):
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
@@ -106,9 +110,10 @@ def main_uncensored(number):
'label': getSeriseJa(lx),
'imagecut': 0,
# 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number,
'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py',
'series': getSeriseJa(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -136,7 +141,7 @@ def main(number):
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
@@ -151,6 +156,7 @@ def main(number):
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(lx),
'无码': getUncensored(lx)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -168,13 +174,14 @@ def main(number):
return js
if __name__ == "__main__" :
config.G_conf_override['debug_mode:switch'] = True
print(main('ABP-888'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
config.getInstance().set_override("debug_mode:switch=1")
# print(main('ABP-888'))
# print(main('ABP-960'))
# print(main('ADV-R0624')) # 404
# print(main('MMNT-010'))
# print(main('ipx-292'))
# print(main('CEMD-011'))
# print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001'))
print(main('AVSW-061'))


@@ -166,12 +166,23 @@ def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title): # fetch storyline; sites queried concurrently
return getStoryline(number,title)
def getOutline(number, title, uncensored): # fetch storyline; sites queried concurrently
return getStoryline(number, title, 无码=uncensored)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
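
getUserRating() parses javdb's rating text, e.g. "4.5分, 由123人評價", into a (score, votes) tuple and returns None on failure, so callers must guard before unpacking, as main() does below. A tiny self-check with an illustrative sample:

import re

sample = '4.5分, 由123人評價'  # illustrative rating text only
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', sample)
assert (float(v[0][0]), int(v[0][1])) == (4.5, 123)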
def main(number):
# After javdb's update, only one numbered mirror can be logged in at a time; the newest login kicks out the old one, so the site is picked from the first javdb*.json file found
@@ -276,7 +287,7 @@ def main(number):
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
@@ -293,8 +304,12 @@ def main(number):
'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py',
'series': getSeries(lx),
'无码': getUncensored(lx)
}
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
@@ -313,18 +328,19 @@ def main(number):
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30'))
# print(main('AGAV-042'))
# print(main('BANK-022'))
# print(main('070116-197'))
print(main('070116-197'))
# print(main('093021_539')) # 没有剧照 片商pacopacomama
#print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
#print(main('MVSD-439'))
# print(main('EHM0001')) # not found
print(main('FC2-2314275'))
#print(main('FC2-2314275'))
# print(main('EBOD-646'))
# print(main('LOVE-262'))
#print(main('ABP-890'))
print(main('ABP-890'))
print(main('blacked.14.12.08'))


@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup # need install
from lxml import etree # need install
from pyquery import PyQuery as pq # need install
@@ -5,24 +7,22 @@ from ADC_function import *
import json
import re
from lib2to3.pgen2 import parse
import sys
from urllib.parse import urlparse, unquote
sys.path.append('../')
def getActorPhoto(html):
return ''
def getTitle(html, number): # get title
title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
try:
result = str(re.split(r'[/||-]', title)[1])
return result.strip()
except:
return title.replace(number.upper(), '').strip()
def getTitle(html): # get title
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(html.xpath("/html/head/title/text()")[0])
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getStudio(html): # get studio (revised)
try:
@@ -61,7 +61,6 @@ def getNum(url, number): # get number
filename = unquote(urlparse(url).path)
# trim the filename
result = filename[1:-5].upper().strip()
print(result)
# strip the Chinese (non-ASCII) part
if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
@@ -83,13 +82,15 @@ def getSerise(html): # get series (revised)
return ''
def getTag(html): # get tags
return html.xpath('//div[@class="article-tags"]/a/text()')
def getTag(html, studio): # get tags
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html): # fetch extrafanart stills
return ''
def cutTags(tags):
actors = []
tags = []
@@ -109,13 +110,15 @@ def main(number):
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
tags = getTag(html)
actor,tags = cutTags(tags);
studio = getStudio(html)
tags = getTag(html, studio)
#actor,tags = cutTags(tags) # the actors' position inside tags is not fixed; gave up extracting them
actor = ''
dic = {
# title
'title': getTitle(html, number),
'title': getTitle(html),
# studio
'studio': getStudio(html),
'studio': studio,
# year
'year': getYear(html),
# outline
@@ -143,7 +146,8 @@ def main(number):
'website': url,
'source': 'madou.py',
# series
'series': getSerise(html)
'series': getSerise(html),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -161,4 +165,11 @@ def main(number):
if __name__ == '__main__':
print(main('MD0094'))
config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))


@@ -5,95 +5,28 @@ from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler.crawler import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('//*[@id="center_column"]/div[1]/h1/text()')).strip(" ['']")
return result.replace('/', ',')
except:
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"出演:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"出演:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','').replace('/',',')
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) #//table/tr[1]/td[1]/text()
result1=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2=str(html.xpath('//th[contains(text(),"メーカー:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1+result2).strip('+').replace("', '",'').replace('"','')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"収録時間:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"品番:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"品番:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+')
def getYear(getRelease):
try:
result = str(re.search('\d{4}',getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"配信開始日:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace('/','-')
class MgsCrawler(Crawler):
def getMgsString(self, _xpath):
html = self.html
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="EnlargeImage"]/@href')).strip(" ['']")
# result = str(html.xpath('//*[@id="center_column"]/div[1]/div[1]/div/div/h2/img/@src')).strip(" ['']")
# /html/body/div[2]/article[2]/div[1]/div[1]/div/div/h2/img/@src
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//p/text()')).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
return result
def getSeries(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
result2 = str(html.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip(
'\\n')
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getExtrafanart(htmlcode): # fetch extrafanart stills
def getExtrafanart(htmlcode2): # fetch extrafanart stills
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode)
html = html_pather.search(htmlcode2)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
@@ -104,36 +37,35 @@ def getExtrafanart(htmlcode): # fetch extrafanart stills
def main(number2):
number=number2.upper()
htmlcode=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode, 'lxml')
a = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode2, 'lxml')
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b)
try:
dic = {
'title': getTitle(htmlcode).replace("\\n", '').replace(' ', ''),
'studio': getStudio(a),
'outline': getOutline(b),
'runtime': getRuntime(a),
'director': getDirector(a),
'actor': getActor(a),
'release': getRelease(a),
'number': getNum(a),
'cover': getCover(htmlcode),
'imagecut': 1,
'tag': getTag(a),
'label': getLabel(a),
'extrafanart': getExtrafanart(htmlcode),
'year': getYear(getRelease(a)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': getSeries(a),
}
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
dic = {
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'imagecut': 1,
'tag': getTag(a2),
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'extrafanart': getExtrafanart(htmlcode2),
'year': str(re.findall('\d{4}',a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
# str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js


@@ -5,7 +5,6 @@ import json
import builtins
from ADC_function import *
from lxml.html import fromstring
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
@@ -13,7 +12,7 @@ from number_parser import is_uncensored
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池')
G_mode_txt = ('顺序执行','线程池')
class noThread(object):
def map(self, fn, param):
@@ -25,14 +24,15 @@ class noThread(object):
# Fetch the storyline, querying the listed sites concurrently; priority runs front to back
def getStoryline(number, title, sites: list=None):
def getStoryline(number, title, sites: list=None, 无码=None):
start_time = time.time()
conf = config.getInstance()
if not conf.is_storyline():
return ''
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number):
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
@@ -49,9 +49,8 @@ def getStoryline(number, title, sites: list=None):
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = conf.storyline_mode()
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
run_mode = 1 if conf.storyline_mode() > 0 else 0
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
@@ -62,7 +61,7 @@ def getStoryline(number, title, sites: list=None):
if not len(sel):
sel = value
return sel
# The debug output below is written to the log; output from the process pool is not and shows only on stdout
# The debug output below is written to the log
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
for site, desc in zip(apply_sites, results):
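
The refactor drops the process pool and treats any run_mode above 0 as the thread pool. A condensed sketch of the remaining dispatch pattern, assuming noThread also implements the context-manager protocol (the hunk elides those methods):

from multiprocessing.dummy import Pool as ThreadPool

class noThread:
    # sequential fallback; __enter__/__exit__ assumed, as in the class above
    def map(self, fn, param):
        return list(map(fn, param))
    def __enter__(self):
        return self
    def __exit__(self, *exc):
        return False

def run_tasks(fn, args_list, run_mode: int, workers: int = 4):
    # run_mode > 0: thread pool; otherwise plain sequential execution
    with ThreadPool(workers) if run_mode > 0 else noThread() as pool:
        return pool.map(fn, args_list)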
@@ -80,34 +79,33 @@ def getStoryline(number, title, sites: list=None):
def getStoryline_mp(args):
def _inner(site, number, title, debug):
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
# In process-pool mode, print() from the getStoryline_*() child processes is not written to the log; thread-pool and sequential modes are unaffected
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
(site, number, title, debug) = args
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
return _inner(*args)
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
@@ -308,8 +306,8 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
@@ -325,8 +323,9 @@ def getStoryline_amazon(q_title, number, debug):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
div = lx.xpath('//*[@id="productDescription"]')[0]
ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1) + ' '.join(p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None:
@@ -406,10 +405,10 @@ def amazon_select_one(a_titles, q_title, number, debug):
# in debug mode, log recognition-accuracy statistics
if ratio < 0.9:
# rejected results with similarity in [0.5, 0.9) are logged separately
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# log the accepted result
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel


@@ -128,7 +128,7 @@ def getOutline(html, number, title):
a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
g = getStoryline(number, title, site, 无码=False)
if len(g):
return g
try:


@@ -1,119 +1,130 @@
# For a detailed configuration guide, see
# - https://github.com/yoshiko2/Movie_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini
[common]
main_mode=1
source_folder=./
failed_output_folder=failed
success_output_folder=JAV_output
soft_link=0
failed_move=1
auto_exit=0
translate_to_sc=0
multi_threading=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
actor_gender=female
del_empty_folder=1
; Skip .NFO files modified within the last N days (default 30); in organize mode (main_mode=3)
; or soft-link mode this avoids re-scraping the same leading video files repeatedly. 0 processes all video files
nfo_skip_days=30
; Stop after processing this many video files; 0 processes all video files
stop_counter=0
; Used together, the two settings above let you scrape or organize thousands of files in small batches without triggering bans from translation or metadata sites
ignore_failed_list=0
download_only_missing_images=1
mapping_table_validity=7
[proxy]
;proxytype: http or socks5 or socks5h switch: 0 1
switch=0
type=socks5
proxy=127.0.0.1:1080
timeout=10
retry=3
cacert_file=
[Name_Rule]
location_rule=actor+'/'+number
naming_rule=number+'-'+title
max_title_len=50
[update]
update_check=1
[priority]
website=javbus,airav,fanza,xcity,javdb,mgstage,fc2,avsox,dlsite,carib,fc2club
[escape]
literals=\()/
folders=failed,JAV_output
[debug_mode]
switch=0
; machine translation
[translate]
switch=0
; options: google-free, azure
engine=google-free
; Azure translation key
key=
; translation delay
delay=1
values=title,outline
service_site=translate.google.cn
; trailers
[trailer]
switch=0
; prefixes used to decide whether a film is uncensored
[uncensored]
uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,MMDV
[media]
; video file extensions
media_type=.mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.iso,.mpg,.m4v
; subtitle file extensions
sub_type=.smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml
; watermark
[watermark]
switch=1
water=2
; top-left 0, top-right 1, bottom-right 2, bottom-left 3
; extrafanart stills
[extrafanart]
switch=1
parallel_download=5
extrafanart_folder=extrafanart
; storyline
[storyline]
switch=1
; When website is javbus, javdb, avsox, xcity or carib, the site / censored_site / uncensored_site
; lists name the candidate sources for storyline text. Sites in a list are queried concurrently;
; priority follows the number before the colon, ascending, and a higher-numbered site's result is
; only used when the lower-numbered sites returned nothing.
; airavwiki, airav, avno1 and 58avgo provide Chinese storylines: airav covers censored films only,
; avno1 and airavwiki cover both censored and uncensored, and 58avgo covers uncensored or leaked
; decensored films (this capability is unused).
; xcity and amazon are Japanese; because the Amazon store has no product numbers, the matching DVD
; is selected with only 99.6% accuracy. If all three lists are empty nothing is queried, and
; disabling the lookup this way greatly speeds up scraping.
; site=
site=1:avno1,4:airavwiki
censored_site=2:airav,5:xcity,6:amazon
uncensored_site=3:58avgo
; run mode 0: sequential (slowest)  1: thread pool (default)  2: process pool (higher startup cost than the thread pool; faster as more sites are queried concurrently)
run_mode=1
; show_result: storyline debug info. 0 off, 1 brief, 2 verbose (verbose details are not logged); enable 2 to diagnose why storylines stop working
show_result=0
; Traditional/Simplified Chinese conversion. mode=0: no conversion  1: Traditional to Simplified  2: Simplified to Traditional
[cc_convert]
mode=1
vars=outline,series,studio,tag,title
[javdb]
sites=33,34
; face recognition  hog: histogram of oriented gradients (less accurate, fast)  cnn: deep-learning model (accurate, needs GPU/CUDA, slow)
[face]
locations_model=hog
# For a detailed configuration guide, see
# - https://github.com/yoshiko2/Movie_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini
[common]
main_mode=1
source_folder=./
failed_output_folder=failed
success_output_folder=JAV_output
link_mode=0
; 0: do not scrape hard-linked files  1: scrape hard-linked files
scan_hardlink=0
failed_move=0
auto_exit=0
translate_to_sc=0
multi_threading=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
actor_gender=female
del_empty_folder=1
; Skip .NFO files modified within the last N days (default 30); in organize mode (main_mode=3)
; or soft-link mode this avoids re-scraping the same leading video files repeatedly. 0 processes all video files
nfo_skip_days=30
; Stop after processing this many video files; 0 processes all video files
stop_counter=0
; Re-run delay; units: h=hours, m=minutes, s=seconds. Examples: 1h30m45s (1 hour 30 minutes 45 seconds), 45 (45 seconds)
; Only effective when stop_counter is non-zero: after every stop_counter films, wait rerun_delay and run again
rerun_delay=0
; Used together, the three settings above let you scrape or organize thousands of files in small batches without triggering bans from translation or metadata sites
ignore_failed_list=0
download_only_missing_images=1
mapping_table_validity=7
[proxy]
;proxytype: http or socks5 or socks5h switch: 0 1
switch=0
type=socks5
proxy=127.0.0.1:1080
timeout=10
retry=3
cacert_file=
[Name_Rule]
location_rule=actor+'/'+number
naming_rule=number+'-'+title
max_title_len=50
[update]
update_check=1
[priority]
website=javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,fc2club,madou,mv91,javdb,gcolle
[escape]
literals=\()/
folders=failed,JAV_output
[debug_mode]
switch=0
; machine translation
[translate]
switch=0
; options: google-free, azure
engine=google-free
; Azure translation key
key=
; translation delay
delay=1
values=title,outline
service_site=translate.google.cn
; trailers
[trailer]
switch=0
; prefixes used to decide whether a film is uncensored
[uncensored]
uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,MMDV
[media]
; video file extensions
media_type=.mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.iso,.mpg,.m4v
; subtitle file extensions
sub_type=.smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml
; watermark
[watermark]
switch=1
water=2
; top-left 0, top-right 1, bottom-right 2, bottom-left 3
; extrafanart stills
[extrafanart]
switch=1
parallel_download=5
extrafanart_folder=extrafanart
; storyline
[storyline]
switch=1
; When website is javbus, javdb, avsox, xcity or carib, the site / censored_site / uncensored_site
; lists name the candidate sources for storyline text. Sites in a list are queried concurrently;
; priority follows the number before the colon, ascending, and a higher-numbered site's result is
; only used when the lower-numbered sites returned nothing.
; airavwiki, airav, avno1 and 58avgo provide Chinese storylines: airav covers censored films only,
; avno1 and airavwiki cover both censored and uncensored, and 58avgo covers uncensored or leaked
; decensored films (this capability is unused).
; xcity and amazon are Japanese; because the Amazon store has no product numbers, the matching DVD
; is selected with only 99.6% accuracy. If all three lists are empty nothing is queried, and
; disabling the lookup this way greatly speeds up scraping.
; site=
site=1:avno1,4:airavwiki
censored_site=2:airav,5:xcity,6:amazon
uncensored_site=3:58avgo
; run mode 0: sequential (slowest)  1: thread pool (default)  2: process pool (higher startup cost than the thread pool; faster as more sites are queried concurrently)
run_mode=1
; show_result: storyline debug info. 0 off, 1 brief, 2 verbose (verbose details are not logged); enable 2 to diagnose why storylines stop working
show_result=0
; Traditional/Simplified Chinese conversion. mode=0: no conversion  1: Traditional to Simplified  2: Simplified to Traditional
[cc_convert]
mode=1
vars=outline,series,studio,tag,title
[javdb]
sites=38,39
; face recognition  locations_model=hog: histogram of oriented gradients (less accurate, fast)  cnn: deep-learning model (accurate, needs GPU/CUDA, slow)
; uncensored_only=0: run face recognition on every cover  1: only on uncensored covers; censored covers are simply cut to their right half
; aways_imagecut=0: follow each site's default behavior  1: always crop the cover; enabling this ignores [common]download_only_missing_images=1 and always overwrites the cover
; The poster crop's width-to-height ratio is configurable as aspect_ratio/3. Default aspect_ratio=2.12 fits most censored-film covers; the previous default was 2/3, i.e. aspect_ratio=2
[face]
locations_model=hog
uncensored_only=1
aways_imagecut=0
aspect_ratio=2.12
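
A worked example of the [face] crop comment above: the poster's width-to-height ratio is aspect_ratio/3, so the legacy default of 2 reproduces the classic 2:3 poster while 2.12 crops slightly wider. Values below are illustrative only.

def poster_width(height: int, aspect_ratio: float = 2.12) -> int:
    # crop width implied by [face]aspect_ratio: width = height * aspect_ratio / 3
    return int(height * aspect_ratio / 3)

assert poster_width(600, 2.0) == 400   # legacy 2:3 crop
assert poster_width(600, 2.12) == 424  # current default, slightly wider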

config.py

@@ -3,19 +3,14 @@ import re
import sys
import configparser
import time
import typing
from pathlib import Path
G_conf_override = {
# index 0 save Config() first instance for quick access by using getInstance()
0: None,
# register override config items
"common:main_mode": None,
"common:source_folder": None,
"common:auto_exit": None,
"common:nfo_skip_days": None,
"common:stop_counter": None,
"common:ignore_failed_list": None,
"debug_mode:switch": None
# no need anymore
}
@@ -74,17 +69,17 @@ class Config:
elif (Path(__file__).resolve().parent / 'config.ini').is_file():
res_path = Path(__file__).resolve().parent / 'config.ini'
if res_path is None:
sys.exit(2)
os._exit(2)
ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:")
if re.search('n', ins, re.I):
sys.exit(2)
os._exit(2)
# Only the user's home directory is guaranteed writable, so ~/mdc.ini is chosen as the path for
# the generated config file rather than the current directory, which may lack write permission.
# Keeping a config in the current directory is no longer encouraged and remains only as a trick
# for switching between multiple config files.
write_path = path_search_order[2] # Path.home() / "mdc.ini"
write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8')
print("Config file '{}' created.".format(write_path.resolve()))
input("Press Enter key exit...")
sys.exit(0)
os._exit(0)
# self.conf = self._default_config()
# try:
# self.conf = configparser.ConfigParser()
@@ -95,29 +90,86 @@ class Config:
# except Exception as e:
# print("[-]Config file not found! Use the default settings")
# print("[-]",e)
# sys.exit(3)
# os._exit(3)
# #self.conf = self._default_config()
def getboolean_override(self, section, item) -> bool:
return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(
G_conf_override[f"{section}:{item}"])
def set_override(self, option_cmd: str):
"""
Generic config override option: -C '<override string>'
Syntax: section:key=value[;[section:]key=value][;[section:]key+=value]  (items separated by ';')
    or  section:key+=value[...]  which appends to the existing value; '=' and '+=' may be mixed
Example: face:aspect_ratio=2;aways_imagecut=1;priority:website=javdb
A section name must appear at least once at the start; subsequent items may omit it when the
following keys all belong to the same section. Names may be shortened to any unambiguous prefix:
with sections [proxy] and [priority], 'pro' denotes proxy and 'pri' denotes priority.
[face] has four keys: locations_model=, uncensored_only=, aways_imagecut=, aspect_ratio=
    'l', 'lo', 'loc', ... up to the full name all denote locations_model=
    'u', 'un', 'unc', ... denote uncensored_only=
    'aw', 'awa', ... denote aways_imagecut=
    'as', 'asp', ... denote aspect_ratio=
    'a' alone is ambiguous and therefore not a valid abbreviation
"""
def err_exit(str):
print(str)
os._exit(2)
def getint_override(self, section, item) -> int:
return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(
G_conf_override[f"{section}:{item}"])
def get_override(self, section, item) -> str:
return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(
G_conf_override[f"{section}:{item}"])
sections = self.conf.sections()
sec_name = None
for cmd in option_cmd.split(';'):
syntax_err = True
rex = re.findall(r'^(.*?):(.*?)(=|\+=)(.*)$', cmd, re.U)
if len(rex) and len(rex[0]) == 4:
(sec, key, assign, val) = rex[0]
sec_lo = sec.lower().strip()
key_lo = key.lower().strip()
syntax_err = False
elif sec_name: # a section name has already appeared; later keys in the same section may omit it
rex = re.findall(r'^(.*?)(=|\+=)(.*)$', cmd, re.U)
if len(rex) and len(rex[0]) == 3:
(key, assign, val) = rex[0]
sec_lo = sec_name.lower()
key_lo = key.lower().strip()
syntax_err = False
if syntax_err:
err_exit(f"[-]Config override syntax incorrect. example: 'd:s=1' or 'debug_mode:switch=1'. cmd='{cmd}' all='{option_cmd}'")
if not len(sec_lo):
err_exit(f"[-]Config override Section name '{sec}' is empty! cmd='{cmd}'")
if not len(key_lo):
err_exit(f"[-]Config override Key name '{key}' is empty! cmd='{cmd}'")
if not len(val.strip()):
print(f"[!]Conig overide value '{val}' is empty! cmd='{cmd}'")
sec_name = None
for s in sections:
if not s.lower().startswith(sec_lo):
continue
if sec_name:
err_exit(f"[-]Conig overide Section short name '{sec_lo}' is not unique! dup1='{sec_name}' dup2='{s}' cmd='{cmd}'")
sec_name = s
if sec_name is None:
err_exit(f"[-]Conig overide Section name '{sec}' not found! cmd='{cmd}'")
key_name = None
keys = self.conf[sec_name]
for k in keys:
if not k.lower().startswith(key_lo):
continue
if key_name:
err_exit(f"[-]Conig overide Key short name '{key_lo}' is not unique! dup1='{key_name}' dup2='{k}' cmd='{cmd}'")
key_name = k
if key_name is None:
err_exit(f"[-]Conig overide Key name '{key}' not found! cmd='{cmd}'")
if assign == "+=":
val = keys[key_name] + val
if self.debug():
print(f"[!]Set config override [{sec_name}]{key_name}={val} by cmd='{cmd}'")
self.conf.set(sec_name, key_name, val)
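
Usage sketch of the -C override string this method parses: section and key names may be shortened to any unambiguous prefix, ';' separates items, and '+=' appends. This mirrors the self-test at the bottom of the file.

conf = Config()
# debug_mode:switch=1, face:aspect_ratio=2, face:aways_imagecut=0, priority:website=javdb
conf.set_override("d:s=1;face:asp=2;f:aw=0;pri:w=javdb")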
def main_mode(self) -> int:
try:
return self.getint_override("common", "main_mode")
return self.conf.getint("common", "main_mode")
except ValueError:
self._exit("common:main_mode")
def source_folder(self) -> str:
return self.get_override("common", "source_folder")
return self.conf.get("common", "source_folder")
def failed_folder(self) -> str:
return self.conf.get("common", "failed_output_folder")
@@ -128,14 +180,17 @@ class Config:
def actor_gender(self) -> str:
return self.conf.get("common", "actor_gender")
def soft_link(self) -> bool:
return self.conf.getboolean("common", "soft_link")
def link_mode(self) -> int:
return self.conf.getint("common", "link_mode")
def scan_hardlink(self) -> bool:
return self.conf.getboolean("common", "scan_hardlink", fallback=False)#未找到配置选项,默认不刮削
def failed_move(self) -> bool:
return self.conf.getboolean("common", "failed_move")
def auto_exit(self) -> bool:
return self.getboolean_override("common", "auto_exit")
return self.conf.getboolean("common", "auto_exit")
def translate_to_sc(self) -> bool:
return self.conf.getboolean("common", "translate_to_sc")
@@ -147,19 +202,13 @@ class Config:
return self.conf.getboolean("common", "del_empty_folder")
def nfo_skip_days(self) -> int:
try:
return self.getint_override("common", "nfo_skip_days")
except:
return 30
return self.conf.getint("common", "nfo_skip_days", fallback=30)
def stop_counter(self) -> int:
try:
return self.getint_override("common", "stop_counter")
except:
return 0
return self.conf.getint("common", "stop_counter", fallback=0)
def ignore_failed_list(self) -> bool:
return self.getboolean_override("common", "ignore_failed_list")
return self.conf.getboolean("common", "ignore_failed_list")
def download_only_missing_images(self) -> bool:
return self.conf.getboolean("common", "download_only_missing_images")
@@ -167,6 +216,18 @@ class Config:
def mapping_table_validity(self) -> int:
return self.conf.getint("common", "mapping_table_validity")
def rerun_delay(self) -> int:
value = self.conf.get("common", "rerun_delay")
if not (isinstance(value, str) and re.match(r'^[\dsmh]+$', value, re.I)):
return 0 # not match '1h30m45s' or '30' or '1s2m1h4s5m'
if value.isnumeric() and int(value) >= 0:
return int(value)
sec = 0
sec += sum(int(v) for v in re.findall(r'(\d+)s', value, re.I))
sec += sum(int(v) for v in re.findall(r'(\d+)m', value, re.I)) * 60
sec += sum(int(v) for v in re.findall(r'(\d+)h', value, re.I)) * 3600
return sec
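
rerun_delay() accepts either a bare number of seconds or repeatable h/m/s groups in any order. A standalone restatement for illustration, with a couple of checks:

import re

def parse_delay(value: str) -> int:
    # standalone restatement of Config.rerun_delay() above
    if not re.match(r'^[\dsmh]+$', value, re.I):
        return 0  # neither '1h30m45s' style nor a plain number
    if value.isnumeric():
        return int(value)
    sec = sum(int(v) for v in re.findall(r'(\d+)s', value, re.I))
    sec += sum(int(v) for v in re.findall(r'(\d+)m', value, re.I)) * 60
    sec += sum(int(v) for v in re.findall(r'(\d+)h', value, re.I)) * 3600
    return sec

assert parse_delay('45') == 45
assert parse_delay('1h30m45s') == 5445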
def is_translate(self) -> bool:
return self.conf.getboolean("translate", "switch")
@@ -243,8 +304,8 @@ class Config:
def media_type(self) -> str:
return self.conf.get('media', 'media_type')
def sub_rule(self):
return self.conf.get('media', 'sub_type').split(',')
def sub_rule(self) -> typing.Set[str]:
return set(self.conf.get('media', 'sub_type').lower().split(','))
def naming_rule(self) -> str:
return self.conf.get("Name_Rule", "naming_rule")
@@ -277,7 +338,7 @@ class Config:
return self.conf.get("escape", "folders")
def debug(self) -> bool:
return self.getboolean_override("debug_mode", "switch")
return self.conf.getboolean("debug_mode", "switch")
def is_storyline(self) -> bool:
try:
@@ -304,43 +365,34 @@ class Config:
return "3:58avgo"
def storyline_show(self) -> int:
try:
v = self.conf.getint("storyline", "show_result")
return v if v in (0, 1, 2) else 2 if v > 2 else 0
except:
return 0
v = self.conf.getint("storyline", "show_result", fallback=0)
return v if v in (0, 1, 2) else 2 if v > 2 else 0
def storyline_mode(self) -> int:
try:
v = self.conf.getint("storyline", "run_mode")
return v if v in (0, 1, 2) else 2 if v > 2 else 0
except:
return 1
return 1 if self.conf.getint("storyline", "run_mode", fallback=1) > 0 else 0
def cc_convert_mode(self) -> int:
try:
v = self.conf.getint("cc_convert", "mode")
return v if v in (0, 1, 2) else 2 if v > 2 else 0
except:
return 1
v = self.conf.getint("cc_convert", "mode", fallback=1)
return v if v in (0, 1, 2) else 2 if v > 2 else 0
def cc_convert_vars(self) -> str:
try:
return self.conf.get("cc_convert", "vars")
except:
return "actor,director,label,outline,series,studio,tag,title"
return self.conf.get("cc_convert", "vars",
fallback="actor,director,label,outline,series,studio,tag,title")
def javdb_sites(self) -> str:
try:
return self.conf.get("javdb", "sites")
except:
return "33,34"
return self.conf.get("javdb", "sites", fallback="38,39")
def face_locations_model(self) -> str:
try:
return self.conf.get("face", "locations_model")
except:
return "hog"
return self.conf.get("face", "locations_model", fallback="hog")
def face_uncensored_only(self) -> bool:
return self.conf.getboolean("face", "uncensored_only", fallback=True)
def face_aways_imagecut(self) -> bool:
return self.conf.getboolean("face", "aways_imagecut", fallback=False)
def face_aspect_ratio(self) -> float:
return self.conf.getfloat("face", "aspect_ratio", fallback=2.12)
@staticmethod
def _exit(sec: str) -> None:
@@ -358,7 +410,8 @@ class Config:
conf.set(sec1, "source_folder", "./")
conf.set(sec1, "failed_output_folder", "failed")
conf.set(sec1, "success_output_folder", "JAV_output")
conf.set(sec1, "soft_link", "0")
conf.set(sec1, "link_mode", "0")
conf.set(sec1, "scan_hardlink", "0")
conf.set(sec1, "failed_move", "1")
conf.set(sec1, "auto_exit", "0")
conf.set(sec1, "translate_to_sc", "1")
@@ -370,6 +423,7 @@ class Config:
conf.set(sec1, "ignore_failed_list", 0)
conf.set(sec1, "download_only_missing_images", 1)
conf.set(sec1, "mapping_table_validity", 7)
conf.set(sec1, "rerun_delay", 0)
sec2 = "proxy"
conf.add_section(sec2)
@@ -423,9 +477,9 @@ class Config:
sec11 = "media"
conf.add_section(sec11)
conf.set(sec11, "media_type",
".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,.MP4,.AVI,.RMVB,.WMV,.MOV,.MKV,.FLV,.TS,.WEBM,iso,ISO")
".mp4,.avi,.rmvb,.wmv,.mov,.mkv,.flv,.ts,.webm,iso")
conf.set(sec11, "sub_type",
".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.txt,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")
".smi,.srt,.idx,.sub,.sup,.psb,.ssa,.ass,.usf,.xss,.ssf,.rt,.lrc,.sbv,.vtt,.ttml")
sec12 = "watermark"
conf.add_section(sec12)
@@ -503,8 +557,7 @@ if __name__ == "__main__":
config = Config()
mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override',
'ini_path'}
mfilter = {'conf', 'proxy', '_exit', '_default_config', 'ini_path', 'set_override'}
for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
evprint(f'config.{_m}()')
pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}
@@ -513,36 +566,13 @@ if __name__ == "__main__":
for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]:
evprint(f'getInstance().proxy().{_p}')
# Override Test
G_conf_override["common:nfo_skip_days"] = 4321
G_conf_override["common:stop_counter"] = 1234
assert config.nfo_skip_days() == 4321
assert getInstance().stop_counter() == 1234
# remove override
G_conf_override["common:stop_counter"] = None
G_conf_override["common:nfo_skip_days"] = None
assert config.nfo_skip_days() != 4321
assert config.stop_counter() != 1234
# Create new instance
conf2 = Config()
assert getInstance() != conf2
assert getInstance() == config
G_conf_override["common:main_mode"] = 9
G_conf_override["common:source_folder"] = "A:/b/c"
# Overrides take effect across all instances
assert config.main_mode() == 9
assert conf2.main_mode() == 9
assert getInstance().main_mode() == 9
assert conf2.source_folder() == "A:/b/c"
print("### Override Test ###".center(36))
evprint('getInstance().main_mode()')
evprint('config.source_folder()')
G_conf_override["common:main_mode"] = None
evprint('conf2.main_mode()')
evprint('config.main_mode()')
# accessing an unregistered key raises KeyError
try:
print(G_conf_override["common:actor_gender"])
except KeyError as ke:
print(f'Caught KeyError: {ke} is not a registered key of G_conf_override dict.', file=sys.stderr)
conf2.set_override("d:s=1;face:asp=2;f:aw=0;pri:w=javdb;f:l=")
assert conf2.face_aspect_ratio() == 2
assert conf2.face_aways_imagecut() == False
assert conf2.sources() == "javdb"
print(f"Load Config file '{conf2.ini_path}'.")

core.py

@@ -1,5 +1,6 @@
import json
import os.path
import os
import pathlib
import re
import shutil
@@ -10,6 +11,7 @@ from PIL import Image
from io import BytesIO
from pathlib import Path
from datetime import datetime
from lxml import etree
from ADC_function import *
from WebCrawler import get_data_from_json
@@ -27,15 +29,15 @@ def escape_path(path, escape_literals: str): # Remove escape literals
def moveFailedFolder(filepath):
conf = config.getInstance()
failed_folder = conf.failed_folder()
soft_link = conf.soft_link()
link_mode = conf.link_mode()
# Mode 3 and link modes now maintain a failed list instead; it is loaded when the scan starts so
# those paths are excluded from repeated processing.
# The old approach of soft-linking into the failed folder was not intuitive and made failed files
# hard to locate; recording the file path directly is simpler.
if conf.main_mode() == 3 or soft_link:
if conf.main_mode() == 3 or link_mode:
ftxt = os.path.abspath(os.path.join(failed_folder, 'failed_list.txt'))
print("[-]Add to Failed List file, see '%s'" % ftxt)
with open(ftxt, 'a', encoding='utf-8') as flt:
flt.write(f'{filepath}\n')
elif conf.failed_move() and not soft_link:
elif conf.failed_move() and not link_mode:
failed_name = os.path.join(failed_folder, os.path.basename(filepath))
mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt'))
print("'[-]Move to Failed output folder, see '%s'" % mtxt)
@@ -69,10 +71,12 @@ def get_info(json_data): # 返回json里的数据
return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label
def small_cover_check(path, number, cover_small, leak_word, c_word, hack_word, filepath):
filename = f"{number}{leak_word}{c_word}{hack_word}-poster.jpg"
download_file_with_filename(cover_small, filename, path, filepath)
print('[+]Image Downloaded! ' + os.path.join(path, filename))
def small_cover_check(path, filename, cover_small, movie_path):
full_filepath = Path(path) / filename
if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(str(full_filepath)):
return
download_file_with_filename(cover_small, filename, path, movie_path)
print('[+]Image Downloaded! ' + full_filepath.name)
def create_folder(json_data): # create the output folder
@@ -101,7 +105,7 @@ def create_folder(json_data): # 创建文件夹
os.makedirs(path)
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
os._exit(0)
return os.path.normpath(path)
@@ -121,7 +125,7 @@ def download_file_with_filename(url, filename, path, filepath):
os.makedirs(path)
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
os._exit(0)
proxies = configProxy.proxies()
headers = {
'User-Agent': G_USER_AGENT}
@@ -138,7 +142,7 @@ def download_file_with_filename(url, filename, path, filepath):
os.makedirs(path)
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
os._exit(0)
headers = {
'User-Agent': G_USER_AGENT}
r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -213,7 +217,7 @@ def extrafanart_download_one_by_one(data, path, filepath):
break
if file_not_exist_or_empty(jpg_fullpath):
return
print('[+]Image Downloaded!', jpg_fullpath)
print('[+]Image Downloaded!', Path(jpg_fullpath).name)
j += 1
if conf.debug():
print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
@@ -244,7 +248,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
if failed: # non-fatal error; the movie is not moved to the failed folder, and mode 3 can fetch the missing images later
print(f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.")
else:
print(f"[+]Successfully downloaded {len(result)} extrafanart to '{extrafanart_dir}'")
print(f"[+]Successfully downloaded {len(result)} extrafanarts.")
if conf.debug():
print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
@@ -255,7 +259,7 @@ def image_ext(url):
return ".jpg"
# Was the cover downloaded successfully? Otherwise the movie is moved to failed
def image_download(cover, fanart_path,thumb_path, path, filepath):
def image_download(cover, fanart_path, thumb_path, path, filepath):
full_filepath = os.path.join(path, fanart_path)
if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
return
@@ -273,7 +277,7 @@ def image_download(cover, fanart_path,thumb_path, path, filepath):
break
if file_not_exist_or_empty(full_filepath):
return
print('[+]Image Downloaded!', full_filepath)
print('[+]Image Downloaded!', Path(full_filepath).name)
shutil.copyfile(full_filepath, os.path.join(path, thumb_path))
@@ -289,8 +293,14 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
os.makedirs(path)
except:
print(f"[-]Fatal error! can not make folder '{path}'")
sys.exit(0)
os._exit(0)
old_nfo = None
try:
if os.path.isfile(nfo_path):
old_nfo = etree.parse(nfo_path)
except:
pass
# KODI's movie-info view has nowhere to show the number; naming_rule=number+'#'+title would fix
# that but makes titles too long. The usually-empty outline is a better fit, and the UI reserves
# a larger display area for the outline anyway.
outline = f"{number}#{outline}"
@@ -354,6 +364,41 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
print(" <premiered>" + release + "</premiered>", file=code)
print(" <releasedate>" + release + "</releasedate>", file=code)
print(" <release>" + release + "</release>", file=code)
if old_nfo:
try:
xur = old_nfo.xpath('//userrating/text()')[0]
if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()):
print(f" <userrating>{xur.strip()}</userrating>", file=code)
except:
pass
try:
f_rating = json_data['用户评分']
uc = json_data['评分人数']
print(f""" <rating>{round(f_rating * 2.0, 1)}</rating>
<criticrating>{round(f_rating * 20.0, 1)}</criticrating>
<ratings>
<rating name="javdb" max="5" default="true">
<value>{f_rating}</value>
<votes>{uc}</votes>
</rating>
</ratings>""", file=code)
except:
if old_nfo:
try:
for rtag in ('rating', 'criticrating'):
xur = old_nfo.xpath(f'//{rtag}/text()')[0]
if isinstance(xur, str) and re.match('\d+\.\d+|\d+', xur.strip()):
print(f" <{rtag}>{xur.strip()}</{rtag}>", file=code)
f_rating = old_nfo.xpath(f"//ratings/rating[@name='javdb']/value/text()")[0]
uc = old_nfo.xpath(f"//ratings/rating[@name='javdb']/votes/text()")[0]
print(f""" <ratings>
<rating name="javdb" max="5" default="true">
<value>{f_rating}</value>
<votes>{uc}</votes>
</rating>
</ratings>""", file=code)
except:
pass
print(" <cover>" + cover + "</cover>", file=code)
if config.getInstance().is_trailer():
print(" <trailer>" + trailer + "</trailer>", file=code)
@@ -462,51 +507,51 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
# ======================== End =================================
def paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word): # file path, number, suffix, destination folder
def paste_file_to_folder(filepath, path, multi_part, number, part, leak_word, c_word, hack_word): # file path, number, suffix, destination folder
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
try:
targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}{houzhui}")
# Never overwrite under any circumstances, lest a data-source or engine error assign every file
# the same number and successive same-name overwrites destroy all files irrecoverably.
if os.path.exists(targetpath):
raise FileExistsError('File Exists on destination path, we will never overwrite.')
soft_link = config.getInstance().soft_link()
# if soft_link=1, use a symlink
if soft_link == 0:
link_mode = config.getInstance().link_mode()
# link_mode 1: create a symlink; 2: prefer a hard link, falling back to a symlink if it cannot be created
# the old soft_link=2 behavior was removed, since the default log already makes file origins traceable
create_softlink = False
if link_mode not in (1, 2):
shutil.move(filepath, targetpath)
elif soft_link == 1:
elif link_mode == 2:
# a hard link across volumes or drive letters raises an exception; fall back to a symlink
try:
os.link(filepath, targetpath, follow_symlinks=False)
except:
create_softlink = True
if link_mode == 1 or create_softlink:
# Try a relative path first so the video still opens when accessed over the network; if that fails
# (e.g. across drive letters, where relative paths are unsupported), retry the symlink with an absolute path
try:
filerelpath = os.path.relpath(filepath, path)
os.symlink(filerelpath, targetpath)
except:
os.symlink(filepath_obj.resolve(), targetpath)
elif soft_link == 2:
shutil.move(filepath, targetpath)
# After moving the file, leave a traceable symlink at the original location pointing to the new one,
# so you can trace where a file went and avoid losing it after a rename/move under a wrong number;
# this makes manual recovery easy. Since symlinks are no longer scraped, the suffix needs no change.
targetabspath = os.path.abspath(targetpath)
if targetabspath != os.path.abspath(filepath):
targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
os.symlink(targetrelpath, filepath)
sub_res = config.getInstance().sub_rule()
os.symlink(str(filepath_obj.resolve()), targetpath)
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
if os.path.isfile(sub_filepath.replace(subname,".chs" + subname)):
sub_filepath = sub_filepath.replace(subname,".chs" + subname)
subname = ".chs" + subname
elif os.path.isfile(sub_filepath.replace(subname,".cht" + subname)):
sub_filepath = sub_filepath.replace(subname, ".cht" + subname)
subname = ".cht" + subname
if os.path.isfile(sub_filepath):
shutil.move(sub_filepath, os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}{subname}"))
print('[+]Sub moved!')
return True
sub_res = config.getInstance().sub_rule()
for subfile in filepath_obj.parent.glob('**/*'):
if subfile.is_file() and subfile.suffix.lower() in sub_res:
if multi_part and part.lower() not in subfile.name.lower():
continue
if filepath_obj.stem.split('.')[0].lower() != subfile.stem.split('.')[0].lower():
continue
sub_targetpath = Path(path) / f"{number}{leak_word}{c_word}{hack_word}{''.join(subfile.suffixes)}"
if link_mode not in (1, 2):
shutil.move(str(subfile), str(sub_targetpath))
print(f"[+]Sub Moved! {sub_targetpath.name}")
else:
shutil.copyfile(str(subfile), str(sub_targetpath))
print(f"[+]Sub Copied! {sub_targetpath.name}")
return
except FileExistsError as fee:
print(f'[-]FileExistsError: {fee}')
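Summarizing the new link_mode semantics above (0/other: move; 1: symlink; 2: hard link preferred, symlink fallback), here is a condensed self-contained sketch of the same decision chain. The place_file name and its arguments are illustrative, not the project's API:

import os, shutil

def place_file(src, dst_dir, dst_name, link_mode):
    dst = os.path.join(dst_dir, dst_name)
    if os.path.exists(dst):
        # never overwrite, per the comment above
        raise FileExistsError('File Exists on destination path, we will never overwrite.')
    if link_mode not in (1, 2):
        shutil.move(src, dst)  # default: plain move
        return
    if link_mode == 2:
        try:
            os.link(src, dst, follow_symlinks=False)  # hard link works only on the same volume
            return
        except OSError:
            pass  # cross-volume/drive: fall through to a symlink
    try:
        # relative target first, so the link still resolves over network shares
        os.symlink(os.path.relpath(src, dst_dir), dst)
    except OSError:
        os.symlink(os.path.abspath(src), dst)  # absolute target as the last resort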
@@ -525,24 +570,39 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
number += part # number gets the CD1-style suffix appended here
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}{houzhui}")
if os.path.exists(targetpath):
raise FileExistsError('File Exists on destination path, we will never overwrite.')
try:
if config.getInstance().soft_link():
os.symlink(filepath, targetpath)
else:
link_mode = config.getInstance().link_mode()
create_softlink = False
if link_mode not in (1, 2):
shutil.move(filepath, targetpath)
elif link_mode == 2:
try:
os.link(filepath, targetpath, follow_symlinks=False)
except:
create_softlink = True
if link_mode == 1 or create_softlink:
try:
filerelpath = os.path.relpath(filepath, path)
os.symlink(filerelpath, targetpath)
except:
os.symlink(str(filepath_obj.resolve()), targetpath)
sub_res = config.getInstance().sub_rule()
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
if os.path.isfile(sub_filepath): # move subtitles
shutil.move(sub_filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{hack_word}{subname}"))
print('[+]Sub moved!')
print('[!]Success')
return True
for subfile in filepath_obj.parent.glob('**/*'):
if subfile.is_file() and subfile.suffix.lower() in sub_res:
if multi_part and part.lower() not in subfile.name.lower():
continue
sub_targetpath = Path(path) / f"{number}{leak_word}{c_word}{hack_word}{''.join(subfile.suffixes)}"
if link_mode not in (1, 2):
shutil.move(str(subfile), str(sub_targetpath))
print(f"[+]Sub Moved! {sub_targetpath.name}")
else:
shutil.copyfile(str(subfile), str(sub_targetpath))
print(f"[+]Sub Copied! {sub_targetpath.name}")
return
except FileExistsError as fee:
print(f'[-]FileExistsError: {fee}')
return
@@ -554,18 +614,6 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
return
def get_part(filepath):
try:
if re.search('-CD\d+', filepath):
return re.findall('-CD\d+', filepath)[0]
if re.search('-cd\d+', filepath):
return re.findall('-cd\d+', filepath)[0]
except:
print("[-]failed!Please rename the filename again!")
moveFailedFolder(filepath)
return
def debug_print(data: json):
try:
print("[+] ------- DEBUG INFO -------")
@@ -578,14 +626,65 @@ def debug_print(data: json):
if i == 'extrafanart':
print('[+] -', "%-14s" % i, ':', len(v), 'links')
continue
print('[+] -', "%-14s" % i, ':', v)
print(f'[+] - {i:<{cnspace(i,14)}} : {v}')
print("[+] ------- DEBUG INFO -------")
except:
pass
def core_main(file_path, number_th, oCC):
def core_main_no_net_op(movie_path, number):
conf = config.getInstance()
part = ''
leak_word = ''
leak = 0
c_word = ''
cn_sub = ''
hack = ''
hack_word = ''
ext = '.jpg'
imagecut = 1
path = str(Path(movie_path).parent)
if re.search(r'[-_]CD\d+', movie_path, re.IGNORECASE):
part = re.findall(r'[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper()
if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path,
re.I) or '中文' in movie_path or '字幕' in movie_path:
cn_sub = '1'
c_word = '-C' # suffix for films with Chinese subtitles
uncensored = 1 if is_uncensored(number) else 0
if '流出' in movie_path or 'uncensored' in movie_path.lower():
leak_word = '-流出' # suffix for leaked films
leak = 1
if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path:
hack = 1
hack_word = "-hack"
prestr = f"{number}{leak_word}{c_word}{hack_word}"
fanart_path = f"{prestr}-fanart{ext}"
poster_path = f"{prestr}-poster{ext}"
thumb_path = f"{prestr}-thumb{ext}"
full_fanart_path = os.path.join(path, fanart_path)
full_poster_path = os.path.join(path, poster_path)
full_thumb_path = os.path.join(path, thumb_path)
full_nfo = Path(path) / f"{prestr}{part}.nfo"
if full_nfo.is_file():
if full_nfo.read_text(encoding='utf-8').find(r'<tag>无码</tag>') >= 0:
uncensored = 1
else:
return
if not all(os.path.isfile(f) for f in (full_fanart_path, full_thumb_path)):
return
cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
if conf.is_watermark():
add_mark(full_poster_path, full_thumb_path, cn_sub, leak, uncensored, hack)
def core_main(movie_path, number_th, oCC):
conf = config.getInstance()
# =======================================================================initialize required variables
multi_part = 0
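A side note on the debug_print change above: the new f-string pads with cnspace(i, 14) instead of a fixed %-14s because full-width CJK keys occupy two terminal cells each, which breaks fixed-width alignment. The body of cnspace is not part of this diff, so the following is only a plausible sketch of such a helper under that assumption:

from unicodedata import east_asian_width

def cnspace(s: str, width: int) -> int:
    # full-width ('F') and wide ('W') characters render as two cells,
    # so shrink the pad width by one per wide character
    wide = sum(1 for ch in s if east_asian_width(ch) in 'FW')
    return max(width - wide, 0)

# usage, as in the print above: f'[+] - {i:<{cnspace(i, 14)}} : {v}'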
@@ -597,8 +696,6 @@ def core_main(file_path, number_th, oCC):
hack = ''
hack_word = ''
filepath = file_path # absolute path of the movie file
# the commented-out variable below is not needed
#rootpath= os.getcwd
number = number_th
@@ -606,7 +703,7 @@ def core_main(file_path, number_th, oCC):
# Return if blank dict returned (data not found)
if not json_data:
moveFailedFolder(filepath)
moveFailedFolder(movie_path)
return
if json_data["number"] != number:
@@ -619,25 +716,26 @@ def core_main(file_path, number_th, oCC):
imagecut = json_data.get('imagecut')
tag = json_data.get('tag')
# =======================================================================detect -C / -CD suffixes
if '-CD' in filepath or '-cd' in filepath:
if re.search(r'[-_]CD\d+', movie_path, re.IGNORECASE):
multi_part = 1
part = get_part(filepath)
if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
part = re.findall(r'[-_]CD\d+', movie_path, re.IGNORECASE)[0].upper()
if re.search(r'[-_]C(\.\w+$|-\w+)|\d+ch(\.\w+$|-\w+)', movie_path,
re.I) or '中文' in movie_path or '字幕' in movie_path:
cn_sub = '1'
c_word = '-C' # suffix for films with Chinese subtitles
# determine whether the film is uncensored
uncensored = 1 if is_uncensored(number) else 0
unce = json_data.get('无码')
uncensored = int(unce) if isinstance(unce, bool) else int(is_uncensored(number))
if '流出' in filepath or 'uncensored' in filepath:
if '流出' in movie_path or 'uncensored' in movie_path.lower():
liuchu = '流出'
leak = 1
leak_word = '-流出' # suffix for leaked films
else:
leak = 0
if 'hack'.upper() in str(filepath).upper() or '破解' in filepath:
if 'hack'.upper() in str(movie_path).upper() or '破解' in movie_path:
hack = 1
hack_word = "-hack"
@@ -666,78 +764,76 @@ def core_main(file_path, number_th, oCC):
# check the small cover; if imagecut is 3, download the small cover
if imagecut == 3:
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path)
# creatFolder returns the path for this number
image_download( cover, fanart_path,thumb_path, path, filepath)
image_download( cover, fanart_path,thumb_path, path, movie_path)
if not multi_part or part.lower() == '-cd1':
try:
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, filepath)
trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path)
except:
pass
try:
# download stills (extrafanart): data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
extrafanart_download(json_data.get('extrafanart'), path, number, filepath)
extrafanart_download(json_data.get('extrafanart'), path, number, movie_path)
except:
pass
# crop the cover image
cutImage(imagecut, path , fanart_path, poster_path)
cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
# add watermarks
if conf.is_watermark():
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
# move the movie file
paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word)
paste_file_to_folder(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word)
# finally write the .nfo metadata file; a completed .nfo marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word
,fanart_path,poster_path,thumb_path)
elif conf.main_mode() == 2:
# create the folder
path = create_folder(json_data)
# move the file
paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, hack_word)
paste_file_to_folder_mode2(movie_path, path, multi_part, number, part, leak_word, c_word, hack_word)
if conf.is_watermark():
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
elif conf.main_mode() == 3:
path = str(Path(file_path).parent)
path = str(Path(movie_path).parent)
if multi_part == 1:
number += part # number gets the CD1-style suffix appended here
# check the small cover; if imagecut is 3, download the small cover
if imagecut == 3:
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
small_cover_check(path, poster_path, json_data.get('cover_small'), movie_path)
# creatFolder returns the path for this number
image_download( cover, fanart_path,thumb_path, path, filepath)
image_download( cover, fanart_path, thumb_path, path, movie_path)
if not multi_part or part.lower() == '-cd1':
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, filepath)
trailer_download(json_data.get('trailer'), leak_word, c_word, hack_word, number, path, movie_path)
# download stills (extrafanart): data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
extrafanart_download(json_data.get('extrafanart'), path, number, filepath)
extrafanart_download(json_data.get('extrafanart'), path, number, movie_path)
# crop the cover image
cutImage(imagecut, path , fanart_path, poster_path)
cutImage(imagecut, path, fanart_path, poster_path, bool(conf.face_uncensored_only() and not uncensored))
# add watermarks
if conf.is_watermark():
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
# finally write the .nfo metadata file; a completed .nfo marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, movie_path,
tag, json_data.get('actor_list'), liuchu, uncensored, hack_word,fanart_path,poster_path,thumb_path)

View File

@@ -2,7 +2,7 @@
main_mode=1
failed_output_folder=data/failure_output
success_output_folder=data/organized
soft_link=0
link_mode=0
[proxy]
proxy=

View File

@@ -5,8 +5,9 @@ import config
import typing
G_spat = re.compile(
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|"
"^hhd800\.com@|-uncensored|_uncensored|-leak|_leak|-4K|_4K",
"^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|"
"^(fhd|hd|sd|1080p|720p|4K)(-|_)|"
"(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|leak)",
re.IGNORECASE)
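The widened G_spat pattern now strips arbitrary site prefixes (anything shaped like word.tld@) as well as leading or trailing quality/codec tags. Demonstrating the cleanup with the pattern exactly as defined above (only the two sample inputs are hypothetical):

import re

G_spat = re.compile(
    r"^\w+\.(cc|com|net|me|club|jp|tv|xyz|biz|wiki|info|tw|us|de)@|^22-sht\.me|"
    r"^(fhd|hd|sd|1080p|720p|4K)(-|_)|"
    r"(-|_)(fhd|hd|sd|1080p|720p|4K|x264|x265|uncensored|leak)",
    re.IGNORECASE)
print(G_spat.sub('', 'hhd800.com@STARS-566-HD'))  # -> STARS-566
print(G_spat.sub('', 'jav20s8.com@GIGL-677_4K'))  # -> GIGL-677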
@@ -46,9 +47,13 @@ def get_number(debug: bool, file_path: str) -> str:
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)
filename = re.sub("[-_]cd\d{1,2}", "", filename, flags=re.IGNORECASE)
if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况例如n1012-CD1.wmv
return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group())
file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
if re.search("\d+ch$", file_number, flags=re.I):
file_number = file_number[:-2]
return file_number.upper()
else: # extract numbers without a hyphen (FANZA CID)
# matching rules for Western releases
@@ -124,7 +129,8 @@ def is_uncensored(number):
):
return True
if G_cache_uncensored_conf.is_empty():
G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
if G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(',')) is None:
return False
return G_cache_uncensored_conf.check(number)
@@ -146,13 +152,23 @@ if __name__ == "__main__":
"caribean-020317_001.nfo", # -号误命名为_号的
"257138_3xplanet_1Pondo_080521_001.mp4",
"ADV-R0624-CD3.wmv", # 多碟影片
"XXX-AV 22061-CD5.iso", # 支持片商格式 xxx-av-22061 命名规则来自javdb数据源
"XXX-AV 22061-CD5.iso", # 支持片商格式 xxx-av-22061 命名规则来自javdb数据源
"xxx-av 20589.mp4",
"Muramura-102114_145-HD.wmv", # 支持片商格式 102114_145 命名规则来自javdb数据源
"heydouga-4102-023-CD2.iso", # 支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
"Muramura-102114_145-HD.wmv", # 支持片商格式 102114_145 命名规则来自javdb数据源
"heydouga-4102-023-CD2.iso", # 支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
"HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源
"pacopacomama-093021_539-FHD.mkv", # 支持片商格式 093021_539 命名规则来自javdb数据源
"sbw99.cc@heyzo_hd_2636_full.mp4"
"pacopacomama-093021_539-FHD.mkv", # 支持片商格式 093021_539 命名规则来自javdb数据源
"sbw99.cc@heyzo_hd_2636_full.mp4",
"hhd800.com@STARS-566-HD.mp4",
"jav20s8.com@GIGL-677_4K.mp4",
"sbw99.cc@iesp-653-4K.mp4",
"4K-ABP-358_C.mkv",
"n1012-CD1.wmv",
"[]n1012-CD2.wmv",
"rctd-460ch.mp4", # 除支持-C硬字幕外新支持ch硬字幕
"rctd-461CH-CD2.mp4", # ch后可加CDn
"rctd-461-Cd3-C.mp4", # CDn后可加-C
"rctd-461-C-cD4.mp4", # cD1 Cd1 cd1 CD1 最终生成.nfo时统一为大写CD1
)
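For a few of the new fixtures, the expected normalization under the rules added above (derived by tracing the code paths, not quoted from the repo):

# 'hhd800.com@STARS-566-HD.mp4'  -> STARS-566  (site prefix and -HD tag stripped by G_spat)
# '4K-ABP-358_C.mkv'             -> ABP-358    (leading 4K- tag stripped, then the _C sub marker)
# 'rctd-460ch.mp4'               -> RCTD-460   (trailing ch hard-sub marker stripped)
# 'rctd-461-Cd3-C.mp4'           -> RCTD-461   (-Cd3 disc part removed first, -C falls outside the number match)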

View File

@@ -9,7 +9,7 @@ mkdir build
mkdir __pycache__
pyinstaller --onefile Movie_Data_Capture.py `
--hidden-import "ImageProcessing.hog" `
--hidden-import "ImageProcessing.cnn" `
--add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" `
--add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
--add-data "$OPENCC_PATH;opencc" `

View File

@@ -9,4 +9,4 @@ urllib3==1.24.3
certifi==2020.12.5
MechanicalSoup==1.1.0
opencc-python-reimplemented
face_recognition
face_recognition

View File

@@ -1,8 +1,10 @@
pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448 py38-mechanicalsoup
pip install pyquery pyinstaller
pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--hidden-import "ImageProcessing.cnn" \
--add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "$(python3.8 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
--add-data "$(python3.8 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
--add-data "Img:Img" \
--add-data "config.ini:." \

View File

@@ -13,8 +13,10 @@
pip3 install -r requirements.txt
pip3 install cloudscraper==1.2.52
pyinstaller --onefile Movie_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--hidden-import "ImageProcessing.cnn" \
--add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
--add-data "$(python3 -c 'import face_recognition_models as _; print(_.__path__[0])' | tail -n 1):face_recognition_models" \
--add-data "Img:Img" \
--add-data "config.ini:." \