Merge pull request #607 from lededev/log-3

Continue improving the new features submitted last month
Yoshiko2
2021-10-22 00:30:38 +08:00
committed by GitHub
23 changed files with 1669 additions and 847 deletions

View File

@@ -42,6 +42,8 @@ jobs:
--hidden-import ADC_function.py \
--hidden-import core.py \
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "Img:Img" \
--add-data "config.ini:." \
- name: Build with PyInstaller for windows
if: matrix.os == 'windows-latest'
@@ -51,6 +53,8 @@ jobs:
--hidden-import ADC_function.py `
--hidden-import core.py `
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
--add-data "Img;Img" `
--add-data "config.ini;." `
- name: Copy config.ini
run: |

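Note on the two new --add-data pairs above: PyInstaller separates the source and destination with the platform path separator, which is why the Linux/macOS step writes "Img:Img" and "config.ini:." while the Windows step writes "Img;Img" and "config.ini;.". A quick way to confirm the separator on the current platform:

import os
print(os.pathsep)  # ':' on Linux/macOS, ';' on Windows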
View File

@@ -1,8 +1,8 @@
from os import replace
import requests
import hashlib
#import hashlib
from pathlib import Path
import random
import secrets
import os.path
import uuid
import json
@@ -20,12 +20,12 @@ def getXpathSingle(htmlcode, xpath):
return result1
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
# Core web request helper
def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
verify = config.Config().cacert_file()
configProxy = config.Config().proxy()
verify = config.getInstance().cacert_file()
configProxy = config.getInstance().proxy()
errors = ""
if ua is None:
@@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
configProxy = config.Config().proxy()
configProxy = config.getInstance().proxy()
errors = ""
headers_ua = {"User-Agent": G_USER_AGENT}
if headers is None:
@@ -85,8 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
configProxy = config.Config().proxy()
s = None
if isinstance(cookies, dict) and len(cookies):
s = requests.Session()
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
configProxy = config.getInstance().proxy()
if configProxy.enable:
browser.session.proxies = configProxy.proxies()
result = browser.open(url)
@@ -103,17 +107,19 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
return result.text
def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
if isinstance(cookies, dict):
requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
configProxy = config.Config().proxy()
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
s = None
if isinstance(cookies, dict) and len(cookies):
s = requests.Session()
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
configProxy = config.getInstance().proxy()
if configProxy.enable:
browser.session.proxies = configProxy.proxies()
result = browser.open(url)
if not result.ok:
return ''
form = browser.select_form() if form_name is None else browser.select_form(form_name)
form = browser.select_form() if form_select is None else browser.select_form(form_select)
if isinstance(fields, dict):
for k, v in fields.items():
browser[k] = v
@@ -131,7 +137,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
# def get_javlib_cookie() -> [dict, str]:
# import cloudscraper
# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
# switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy()
# proxies = get_proxy(proxy, proxytype)
#
# raw_cookie = {}
@@ -158,7 +164,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
def translateTag_to_sc(tag):
tranlate_to_sc = config.Config().transalte_to_sc()
tranlate_to_sc = config.getInstance().transalte_to_sc()
if tranlate_to_sc:
dict_gen = {'中文字幕': '中文字幕',
'高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头',
@@ -505,8 +511,11 @@ def translate(
delay: int = 0,
):
trans_result = ""
# Chinese sentences containing characters such as '&' get truncated by Google Translate and lose content, and translating Chinese into Chinese is pointless, so such strings are ignored; only strings containing Japanese kana are translated.
if not is_japanese(src):
return src
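# Illustration (hypothetical inputs): is_japanese('素人むすめ') is True, so the string goes to the
# translator; is_japanese('中文标题') is False because it contains no kana, so it is returned as-is.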
if engine == "google-free":
gsite = config.Config().get_translate_service_site()
gsite = config.getInstance().get_translate_service_site()
if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite):
gsite = 'translate.google.cn'
url = (
@@ -521,7 +530,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
trans_result = trans_result.join(translate_list)
# elif engine == "baidu":
# url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
# salt = random.randint(1, 1435660288)
# salt = secrets.randbelow(1435660287) + 1 # random.randint(1, 1435660288)
# sign = app_id + src + str(salt) + key
# sign = hashlib.md5(sign.encode()).hexdigest()
# url += (
@@ -560,17 +569,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
return trans_result
# ======================================================================== uncensored check
def is_uncensored(number):
if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper():
return True
configs = config.Config().get_uncensored()
prefix_list = str(configs).split(',')
for pre in prefix_list:
if pre.upper() in number.upper():
return True
return False
# Cookies exported from a browser login session can open member-only pages that guests cannot access
# Example: FC2-755670 url https://javdb9.com/v/vO8Mn
# JSON file format
@@ -593,20 +591,20 @@ def load_cookies(filename):
filename = os.path.basename(filename)
if not len(filename):
return None, None
path_search_order = [
f"./{filename}",
os.path.join(Path.home(), filename),
os.path.join(Path.home(), f".avdc/{filename}"),
os.path.join(Path.home(), f".local/share/avdc/{filename}")
]
path_search_order = (
Path.cwd() / filename,
Path.home() / filename,
Path.home() / f".avdc/{filename}",
Path.home() / f".local/share/avdc/{filename}"
)
cookies_filename = None
try:
for p in path_search_order:
if os.path.exists(p):
cookies_filename = os.path.abspath(p)
if p.is_file():
cookies_filename = str(p.resolve())
break
if not cookies_filename:
return None, None
try:
return json.load(open(cookies_filename)), cookies_filename
except:
return None, None
@@ -623,10 +621,9 @@ def file_modification_days(filename) -> int:
return 9999
return days
# Check whether the file is a link (symlink or hardlink)
def is_link(filename: str):
if os.path.islink(filename):
return True # symlink
elif os.stat(filename).st_nlink > 1:
return True # hard link Linux MAC OSX Windows NTFS
return False
def file_not_exist_or_empty(filepath) -> bool:
return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
# Simple Japanese detection
def is_japanese(s) -> bool:
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
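
The hunks above migrate every config.Config() call to config.getInstance(). The diff does not include config.py, so as a minimal sketch the accessor presumably looks something like this module-level singleton:

# Hypothetical sketch of the accessor assumed by this diff (config.py itself is not shown).
_instance = None

def getInstance():
    # Parse config.ini once and reuse the same Config object at every call site.
    global _instance
    if _instance is None:
        _instance = Config("config.ini")
    return _instance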

View File

@@ -6,12 +6,13 @@ import sys
import shutil
import typing
import urllib3
import signal
import config
from datetime import datetime, timedelta
import time
from pathlib import Path
from ADC_function import file_modification_days, get_html, is_link
from ADC_function import file_modification_days, get_html
from number_parser import get_number
from core import core_main, moveFailedFolder
@@ -35,30 +36,54 @@ def check_update(local_version):
def argparse_function(ver: str) -> typing.Tuple[str, str, bool]:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
conf = config.getInstance()
parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.")
# parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.")
default_logdir = os.path.join(Path.home(),'.avlogs')
parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
# parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
default_logdir = str(Path.home() / '.avlogs')
parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?',
help=f"""Duplicate stdout and stderr to logfiles
in logging folder, default on.
default for current user: {default_logdir}
Use --log-dir= to turn off logging feature.""")
parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number")
parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true",
help="Auto exit after program complete")
help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
default folder for current user: '{default_logdir}'. Change default folder to an empty file,
or use --log-dir= to turn log off.""")
parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
parser.add_argument("-a", "--auto-exit", action="store_true",
help="Auto exit after program complete")
parser.add_argument("-g","--debug", action="store_true",
help="Turn on debug mode to generate diagnostic log for issue report.")
parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true",
help="""Only show job list of files and numbers, and **NO** actual operation
is performed. It may help you correct wrong numbers before the real job.""")
parser.add_argument("-v", "--version", action="version", version=ver)
#ini_path
args = parser.parse_args()
def get_natural_number_or_none(value):
return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None
def get_str_or_none(value):
return value if isinstance(value, str) and len(value) else None
def get_bool_or_none(value):
return True if isinstance(value, bool) and value else None
config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr
return args.file, args.number, args.logdir, args.regexstr, args.zero_op
class OutLogger(object):
def __init__(self, logfile) -> None:
self.term = sys.stdout
self.log = open(logfile,"w",encoding='utf-8',buffering=1)
self.filepath = logfile
def __del__(self):
self.close()
def __enter__(self):
@@ -85,6 +110,7 @@ class ErrLogger(OutLogger):
def __init__(self, logfile) -> None:
self.term = sys.stderr
self.log = open(logfile,"w",encoding='utf-8',buffering=1)
self.filepath = logfile
def close(self):
if self.term != None:
sys.stderr = self.term
@@ -97,14 +123,18 @@ class ErrLogger(OutLogger):
def dupe_stdout_to_logfile(logdir: str):
if not isinstance(logdir, str) or len(logdir) == 0:
return
if not os.path.isdir(logdir):
os.makedirs(logdir)
if not os.path.isdir(logdir):
return
log_dir = Path(logdir)
if not log_dir.exists():
try:
log_dir.mkdir(parents=True,exist_ok=True)
except:
pass
if not log_dir.is_dir():
return # tip: replace the log directory with an empty regular file of the same name to disable logging
abslog_dir = log_dir.resolve()
log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S")
logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt')
errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt')
logfile = abslog_dir / f'avdc_{log_tmstr}.txt'
errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt'
sys.stdout = OutLogger(logfile)
sys.stderr = ErrLogger(errlog)
@@ -113,28 +143,126 @@ def dupe_stdout_to_logfile(logdir: str):
def close_logfile(logdir: str):
if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir):
return
sys.stdout.close()
sys.stderr.close()
# Clean up empty files
for current_dir, subdirs, files in os.walk(logdir, topdown=False):
# Save the log file path before the logs are closed
filepath = None
try:
for f in files:
full_name = os.path.join(current_dir, f)
if os.path.getsize(full_name) == 0:
os.remove(full_name)
filepath = sys.stdout.filepath
except:
pass
sys.stdout.close()
sys.stderr.close()
log_dir = Path(logdir).resolve()
if isinstance(filepath, Path):
print(f"Log file '{filepath}' saved.")
assert(filepath.parent.samefile(log_dir))
# Clean up empty files
for f in log_dir.glob(r'*_err.txt'):
if f.stat().st_size == 0:
try:
f.unlink(missing_ok=True)
except:
pass
# Merge logs. Only text logs directly inside the log directory are examined; subdirectories are ignored. Logs older than
# three days are merged into one log per day; logs older than three months are merged into one log per month; monthly logs
# from last year and earlier are merged into one log per year starting in April of the current year.
# Test steps:
"""
LOGDIR=/tmp/avlog
mkdir -p $LOGDIR
for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done
for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done
for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done
echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR"
# 1932 files in /tmp/avlog
avdc -zgic1 -d0 -m3 -o $LOGDIR
# python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR
ls $LOGDIR
# rm -rf $LOGDIR
"""
today = datetime.today()
# Step 1: merge into daily logs. Logs older than 3 days whose filenames share the same date are merged into one log.
for i in range(1):
txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)]
if not txts or not len(txts):
break
e = [f for f in txts if '_err' in f.stem]
txts.sort()
tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99")
deadline_day = f'avdc_{tmstr_3_days_ago}'
day_merge = [f for f in txts if f.stem < deadline_day]
if not day_merge or not len(day_merge):
break
cutday = len('T235959.txt') # cut length avdc_20201201|T235959.txt
for f in day_merge:
try:
day_file_name = str(f)[:-cutday] + '.txt' # avdc_20201201.txt
with open(day_file_name, 'a', encoding='utf-8') as m:
m.write(f.read_text(encoding='utf-8'))
f.unlink(missing_ok=True)
except:
pass
# Step 2: merge into monthly logs
for i in range(1): # a single-pass loop whose break jumps to the next step, avoiding a big indented if-block (Python has no goto)
txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)]
if not txts or not len(txts):
break
txts.sort()
tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32")
deadline_month = f'avdc_{tmstr_3_month_ago}'
month_merge = [f for f in txts if f.stem < deadline_month]
if not month_merge or not len(month_merge):
break
tomonth = len('01.txt') # cut length avdc_202012|01.txt
for f in month_merge:
try:
month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt
with open(month_file_name, 'a', encoding='utf-8') as m:
m.write(f.read_text(encoding='utf-8'))
f.unlink(missing_ok=True)
except:
pass
# Step 3: merge monthly logs into yearly logs
if today.month < 4:
return
mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)]
if not mons or not len(mons):
return
mons.sort()
deadline_year = f'avdc_{today.year-1}13'
year_merge = [f for f in mons if f.stem < deadline_year]
if not year_merge or not len(year_merge):
return
toyear = len('12.txt') # cut length avdc_2020|12.txt
for f in year_merge:
try:
year_file_name = str(f)[:-toyear] + '.txt' # avdc_2020.txt
with open(year_file_name, 'a', encoding='utf-8') as y:
y.write(f.read_text(encoding='utf-8'))
f.unlink(missing_ok=True)
except:
pass
# Step 4: compressing yearly logs. If you need compression, do it manually or with an external scheduled script. nongnu's
# lzip currently has the best compression ratio for text logs of this granularity: with lzip -9 the ratio beats xz -9 while
# using less memory, parallelizing better across cores (the multithreaded plzip), and decompressing faster. Compressed size
# is roughly 2.4% to 3.7% of the original; a 100MB log file shrinks to about 3.7MB.
# Rewritten video file scan: recursion removed, global variables dropped, and skip handling for the failed-file list added
def movie_lists(root, conf, regexstr):
escape_folder = re.split("[,]", conf.escape_folder())
def signal_handler(*args):
print('[!]Ctrl+C detected, Exit.')
sys.exit(9)
def sigdebug_handler(*args):
config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'Off'))
# Added: skip files on the failed list, skip by .nfo modification age, report the total number of skipped videos, list each skipped file in debug mode (-g), and skip small ad clips
def movie_lists(source_folder, regexstr):
conf = config.getInstance()
main_mode = conf.main_mode()
debug = conf.debug()
nfo_skip_days = conf.nfo_skip_days()
soft_link = conf.soft_link()
total = []
file_type = conf.media_type().upper().split(",")
file_type = conf.media_type().lower().split(",")
trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
cliRE = None
if isinstance(regexstr, str) and len(regexstr):
@@ -142,72 +270,94 @@ def movie_lists(root, conf, regexstr):
cliRE = re.compile(regexstr, re.IGNORECASE)
except:
pass
failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
failed_set = set()
if main_mode == 3 or soft_link:
if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
try:
with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt:
flist = flt.read().splitlines()
flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
failed_set = set(flist)
flt.close()
if len(flist) != len(failed_set):
with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt:
flt.writelines([line + '\n' for line in failed_set])
flt.close()
if len(flist) != len(failed_set): # deduplicate and write back without changing the order of entries in failed_list.txt; duplicates keep only the last occurrence
fset = failed_set.copy()
for i in range(len(flist)-1, -1, -1):
fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8')
assert len(fset) == 0 and len(flist) == len(failed_set)
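# Worked example (hypothetical): flist = ['a', 'b', 'a', 'c'] keeps only the last duplicate,
# so failed_list.txt is rewritten as ['b', 'a', 'c'] with the remaining order unchanged.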
except:
pass
for current_dir, subdirs, files in os.walk(root, topdown=False):
if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0:
if not Path(source_folder).is_dir():
print('[-]Source folder not found!')
return []
total = []
source = Path(source_folder).resolve()
skip_failed_cnt, skip_nfo_days_cnt = 0, 0
escape_folder_set = set(re.split("[,]", conf.escape_folder()))
for full_name in source.glob(r'**/*'):
if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
continue
for f in files:
full_name = os.path.join(current_dir, f)
if not os.path.splitext(full_name)[1].upper() in file_type:
if not full_name.suffix.lower() in file_type:
continue
absf = os.path.abspath(full_name)
absf = str(full_name)
if absf in failed_set:
skip_failed_cnt += 1
if debug:
print('[!]Skip failed file:', absf)
print('[!]Skip failed movie:', absf)
continue
if cliRE and not cliRE.search(absf):
is_sym = full_name.is_symlink()
if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # short-circuit boolean: symlinks are not stat()ed because they may point to a missing target
continue # file is symlink or hardlink(Linux/NTFS/Darwin)
# Zero-byte debug samples are allowed through; ads smaller than 120MB are filtered out, e.g. '苍老师强力推荐.mp4' (102.2MB), '黑道总裁.mp4' (98.4MB), '有趣的妹子激情表演.MP4' (95MB), '有趣的臺灣妹妹直播.mp4' (15.1MB)
movie_size = 0 if is_sym else full_name.stat().st_size # as above, skip stat()/st_size for symlinks and assign 0 to bypass the small-video check
if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
continue
if main_mode == 3 and nfo_skip_days > 0:
nfo = Path(absf).with_suffix('.nfo')
if file_modification_days(nfo) <= nfo_skip_days:
if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
continue
if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
skip_nfo_days_cnt += 1
if debug:
print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
continue
if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
total.append(absf)
if skip_failed_cnt:
print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
if skip_nfo_days_cnt:
print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
return total
# In soft-link mode, movies already scraped successfully must also have their .nfo age checked in the success folder; skip those updated within N days
skip_numbers = set()
success_folder = conf.success_folder()
for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
for f in files:
f_obj = Path(f)
if f_obj.suffix.lower() != '.nfo':
success_folder = Path(conf.success_folder()).resolve()
for f in success_folder.glob(r'**/*'):
if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
continue
if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
if file_modification_days(f) > nfo_skip_days:
continue
number = get_number(False, f_obj.stem)
if number:
skip_numbers.add(number.upper())
number = get_number(False, f.stem)
if not number:
continue
skip_numbers.add(number.lower())
rm_list = []
for f in total:
n_number = get_number(False, os.path.basename(f))
if n_number and n_number.upper() in skip_numbers:
if n_number and n_number.lower() in skip_numbers:
rm_list.append(f)
for f in rm_list:
total.remove(f)
if debug:
print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
if len(rm_list):
print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.")
return total
def create_failed_folder(failed_folder):
if not os.path.isdir(failed_folder): # create the 'failed' folder
if not os.path.exists(failed_folder): # create the 'failed' folder
try:
os.makedirs(failed_folder)
if not os.path.isdir(failed_folder):
raise
except:
print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)")
print(f"[-]Fatal error! Can not make folder '{failed_folder}'")
sys.exit(0)
@@ -227,24 +377,29 @@ def rm_empty_folder(path):
pass
def create_data_and_move(file_path: str, c: config.Config, debug):
def create_data_and_move(file_path: str, zero_op):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
file_name = os.path.basename(file_path)
n_number = get_number(debug, file_name)
debug = config.getInstance().debug()
n_number = get_number(debug, os.path.basename(file_path))
file_path = os.path.abspath(file_path)
if debug == True:
print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
print(f"[!] [{n_number}] As Number making data for '{file_path}'")
if zero_op:
return
if n_number:
core_main(file_path, n_number, c)
core_main(file_path, n_number)
else:
print("[-] number empty ERROR")
moveFailedFolder(file_path)
print("[*]======================================================")
else:
try:
print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
print(f"[!] [{n_number}] As Number making data for '{file_path}'")
if zero_op:
return
if n_number:
core_main(file_path, n_number, c)
core_main(file_path, n_number)
else:
raise ValueError("number empty")
print("[*]======================================================")
@@ -253,22 +408,26 @@ def create_data_and_move(file_path: str, c: config.Config, debug):
print('[-]', err)
try:
moveFailedFolder(file_path, conf)
moveFailedFolder(file_path)
except Exception as err:
print('[!]', err)
def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number):
def create_data_and_move_with_custom_number(file_path: str, custom_number):
conf = config.getInstance()
file_name = os.path.basename(file_path)
try:
print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number))
core_main(file_path, custom_number, c)
print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number))
if custom_number:
core_main(file_path, custom_number)
else:
print("[-] number empty ERROR")
print("[*]======================================================")
except Exception as err:
print("[-] [{}] ERROR:".format(file_path))
print('[-]', err)
if c.soft_link():
if conf.soft_link():
print("[-]Link {} to failed folder".format(file_path))
os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
else:
@@ -279,12 +438,26 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
print('[!]', err)
if __name__ == '__main__':
def main():
version = '5.0.1'
urllib3.disable_warnings() #Ignore http proxy warning
# Parse command line args
single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version)
# Read config.ini first, in argparse_function() need conf.failed_folder()
conf = config.Config("config.ini")
# Parse command line args
single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
main_mode = conf.main_mode()
if not main_mode in (1, 2, 3):
print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
sys.exit(4)
signal.signal(signal.SIGINT, signal_handler)
if sys.platform == 'win32':
signal.signal(signal.SIGBREAK, sigdebug_handler)
else:
signal.signal(signal.SIGWINCH, sigdebug_handler)
dupe_stdout_to_logfile(logdir)
print('[*]================== AV Data Capture ===================')
@@ -293,55 +466,62 @@ if __name__ == '__main__':
print('[*]======================================================')
print('[*]严禁在墙内宣传本项目')
# Read config.ini
conf = config.Config("config.ini")
start_time = time.time()
print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
if conf.update_check():
check_update(version)
print(f"[+]Load Config file '{conf.ini_path}'.")
if conf.debug():
print('[+]Enable debug')
if conf.soft_link():
print('[!]Enable soft link')
#print('[!]CmdLine:'," ".join(sys.argv[1:]))
if len(sys.argv)>1:
print('[!]CmdLine:'," ".join(sys.argv[1:]))
print('[+]Main Working mode ## {}: {} ## {}{}{}'
.format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1],
"" if not conf.multi_threading() else ", multi_threading on",
"" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
"" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
) if not single_file_path else ('-','Single File', '','',''))
)
create_failed_folder(conf.failed_folder())
start_time = time.time()
if not single_file_path == '': #Single File
print('[+]==================== Single File =====================')
if custom_number == '':
create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path)))
create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
else:
create_data_and_move_with_custom_number(single_file_path, conf, custom_number)
create_data_and_move_with_custom_number(single_file_path, custom_number)
else:
if folder_path == '':
folder_path = conf.source_folder()
if not isinstance(folder_path, str) or folder_path == '':
folder_path = os.path.abspath(".")
movie_list = movie_lists(folder_path, conf, regexstr)
movie_list = movie_lists(folder_path, regexstr)
count = 0
count_all = str(len(movie_list))
print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
main_mode = conf.main_mode()
print('[+]Find', count_all, 'movies.')
print('[*]======================================================')
stop_count = conf.stop_counter()
if stop_count<1:
stop_count = 999999
else:
count_all = str(min(len(movie_list), stop_count))
if main_mode == 3:
print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。')
for movie_path in movie_list: # iterate over the movie list and hand each file to core
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
create_data_and_move(movie_path, conf, conf.debug())
print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
create_data_and_move(movie_path, zero_op)
if count >= stop_count:
print("[!]Stop counter triggered!")
break
if conf.del_empty_folder():
if conf.del_empty_folder() and not zero_op:
rm_empty_folder(conf.success_folder())
rm_empty_folder(conf.failed_folder())
if len(folder_path):
@@ -353,9 +533,15 @@ if __name__ == '__main__':
" End at", time.strftime("%Y-%m-%d %H:%M:%S"))
print("[+]All finished!!!")
if not (conf.auto_exit() or auto_exit):
input("Press enter key exit, you can check the error message before you exit...")
close_logfile(logdir)
if not conf.auto_exit():
input("Press enter key exit, you can check the error message before you exit...")
sys.exit(0)
import multiprocessing
if __name__ == '__main__':
multiprocessing.freeze_support()
main()
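
A usage sketch for the new signal handlers (assuming a POSIX system and an already-known process id; on Windows the toggle is bound to SIGBREAK instead of SIGWINCH):

import os, signal
# Hypothetical: flip debug mode of a running instance without restarting it.
os.kill(scraper_pid, signal.SIGWINCH)  # scraper_pid is an assumed, known PID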

View File

@@ -16,7 +16,9 @@ make:
#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
@echo "[+]Pyinstaller make"
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img"
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--add-data "Img:Img" \
--add-data "config.ini:." \
@echo "[+]Move to bin"
if [ ! -d "./bin" ];then mkdir bin; fi

View File

@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测
return True
def get_data_from_json(file_number, conf: config.Config): # return metadata from JSON
def get_data_from_json(file_number): # return metadata from JSON
"""
iterate through all services and fetch the data
"""
@@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
"fc2club": fc2club.main
}
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
if not len(conf.sources()) > 80:
@@ -114,6 +115,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
pool.close()
pool.terminate()
@@ -125,6 +127,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
json_data = json.loads(func_mapping[source](file_number))
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
except:
break
@@ -134,6 +137,14 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
print('[-]Movie Number not found!')
return None
# Add strict number validation to guard against faulty sources that return mismatched data, e.g. always answering "本橋実来 ADZ335" no matter which number is submitted.
# The current number naming convention follows javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z),
# though other conventions could be tracked too, e.g. airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z).
# If studios under the javdb.com convention ever produce colliding, indistinguishable numbers, consider switching conventions and updating the number parsing and crawling code accordingly.
if str(json_data.get('number')).upper() != file_number.upper():
print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
return None
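# Example of the guard above (hypothetical values): submitting file_number 'ABC-123' to a faulty
# source that answers with number 'ADZ335' -> the mismatch is reported and None is returned.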
# ================================================ end of site rules ================================================
title = json_data.get('title')
@@ -167,6 +178,10 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
imagecut = json_data.get('imagecut')
tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # string to list
while 'XXXX' in tag:
tag.remove('XXXX')
while 'xxx' in tag:
tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
@@ -225,6 +240,8 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
studio = studio.replace('エムズビデオグループ','Ms Video Group')
studio = studio.replace('ミニマム','Minimum')
studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
studio = studio.replace('パコパコママ','pacopacomama')
studio = re.sub('.*/妄想族','妄想族',studio)
studio = studio.replace('/',' ')
# === Replace studio katakana names END
@@ -293,4 +310,7 @@ def special_characters_replacement(text) -> str:
replace('"', ''). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
replace('<', ''). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', ''). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
replace('|', 'ǀ')) # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
replace('&lsquo;', ''). # U+02018 LEFT SINGLE QUOTATION MARK
replace('&rsquo;', ''). # U+02019 RIGHT SINGLE QUOTATION MARK
replace('&amp;', ''))
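
As a hedged example of the extended chain (assuming the replacements above are applied in order):

# '&amp;' and '&lsquo;' are stripped, '|' becomes U+01C0:
special_characters_replacement('A&amp;B|C&lsquo;')  # -> 'ABǀC'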

View File

@@ -6,6 +6,7 @@ from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
from WebCrawler import javbus
'''
API
@@ -17,95 +18,94 @@ API
host = 'https://www.airav.wiki'
# airav has no actor photos, so javbus images are used directly
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'star-name'})
d={}
for i in a:
l=i.a['href']
t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser())
p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p}
d.update(p2)
return d
def getActorPhoto(javbus_json):
result = javbus_json.get('actor_photo')
if isinstance(result, dict) and len(result):
return result
return ''
def getTitle(htmlcode): # get the title
doc = pq(htmlcode)
# h5:first-child selects the first h5 tag; it took ages to find this selector syntax
title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
try:
title2 = re.sub('n\d+-','',title)
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
return title2
def getStudio(htmlcode, javbus_json): # get the studio (revised)
# javbus data takes precedence when available
result = javbus_json.get('studio')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode,etree.HTMLParser())
return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
def getYear(htmlcode, javbus_json): # get the year
result = javbus_json.get('year')
if isinstance(result, str) and len(result):
return result
release = getRelease(htmlcode, javbus_json)
if len(release) != len('2000-01-01'):
return ''
return release[:4]
def getCover(htmlcode, javbus_json): # get the cover image
result = javbus_json.get('cover')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
def getRelease(htmlcode, javbus_json): # get the release date
result = javbus_json.get('release')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
except:
return title
def getStudio(htmlcode): # get the studio (revised)
html = etree.fromstring(htmlcode,etree.HTMLParser())
# if the record has no director, the studio is the 4th <p> element
if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
# if the record has a director, the studio is the 5th <p> element
elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
else:
result = ''
return ''
return result
def getYear(htmlcode): # get the year
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
def getRuntime(javbus_json): # get the runtime
result = javbus_json.get('runtime')
if isinstance(result, str) and len(result):
return result
def getCover(htmlcode): # get the cover URL
doc = pq(htmlcode)
image = doc('a.bigImage')
return urljoin("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): # get the release date
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getRuntime(htmlcode): # get runtime in minutes (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result
def getActor(htmlcode): # get actresses
return ''
# airav's actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
def getActor(htmlcode, javbus_json): # get actresses
b=[]
soup=BeautifulSoup(htmlcode,'lxml')
a=soup.find_all(attrs={'class':'star-name'})
for i in a:
b.append(i.get_text())
html = etree.fromstring(htmlcode, etree.HTMLParser())
a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
for v in a:
v = v.strip()
if len(v):
b.append(v)
if len(b):
return b
def getNum(htmlcode): # get the movie number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
result = javbus_json.get('actor')
if isinstance(result, list) and len(result):
return result
def getDirector(htmlcode): # get the director (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
else:
result = '' # the record may have no director data
return []
def getNum(htmlcode, javbus_json): # get the movie number
result = javbus_json.get('number')
if isinstance(result, str) and len(result):
return result
def getOutline(htmlcode): # get the outline
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('^\[(.*?)]', title)[0])
return result
def getDirector(javbus_json): # get the director (revised)
result = javbus_json.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getOutline(htmlcode): # get the synopsis
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
return result
except:
return ''
def getSerise(htmlcode): # get the series (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
# if the record has no director, the series is the 6th <p> element
if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
# if the record has a director, the series is the 7th <p> element
elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
else:
result = ''
def getSerise(javbus_json): # get the series (revised)
result = javbus_json.get('series')
if isinstance(result, str) and len(result):
return result
return ''
def getTag(htmlcode): # get tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
try:
try:
htmlcode = get_html('https://cn.airav.wiki/video/' + number)
javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
javbus_json = json.loads(javbus.main(number))
except:
print(number)
dic = {
# title: airav can be used
'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
# studio: use javbus
'studio': getStudio(javbus_htmlcode),
# year: also from javbus
'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
'title': getTitle(htmlcode),
# studio: try javbus first, then fall back to this site
'studio': getStudio(htmlcode, javbus_json),
# year: try javbus first, then fall back to this site
'year': getYear(htmlcode, javbus_json),
# outline: use airav
'outline': getOutline(htmlcode),
# use javbus
'runtime': getRuntime(javbus_htmlcode),
'runtime': getRuntime(javbus_json),
# director: use javbus
'director': getDirector(javbus_htmlcode),
# actor: use airav
'actor': getActor(javbus_htmlcode),
# release date: use javbus
'release': getRelease(javbus_htmlcode),
'director': getDirector(javbus_json),
# actor: try airav first
'actor': getActor(htmlcode, javbus_json),
# release date: try javbus first
'release': getRelease(htmlcode, javbus_json),
# number: use javbus
'number': getNum(javbus_htmlcode),
'number': getNum(htmlcode, javbus_json),
# cover URL: use javbus
'cover': getCover(javbus_htmlcode),
'cover': getCover(htmlcode, javbus_json),
# fetch stills
'extrafanart': getExtrafanart(htmlcode),
'imagecut': 1,
# use airav
'tag': getTag(htmlcode),
# use javbus
'label': getSerise(javbus_htmlcode),
'label': getSerise(javbus_json),
# airav does not provide actor photos
'actor_photo': getActorPhoto(javbus_htmlcode),
# 'actor_photo': getActorPhoto(javbus_json),
'website': 'https://www.airav.wiki/video/' + number,
'source': 'airav.py',
# use javbus
'series': getSerise(javbus_htmlcode),
'series': getSerise(javbus_json)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
data = {
"title": "",
@@ -226,6 +224,6 @@ def main(number):
if __name__ == '__main__':
#print(main('ADN-188'))
print(main('ADN-188'))
print(main('CJOD-278'))
print(main('ADV-R0624')) # javbus page returns 404, airav has data
print(main('ADN-188')) # one actress
print(main('CJOD-278')) # multiple actresses; javbus uses kana for names, airav uses kanji
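
The rewritten getters above all follow one lookup pattern; a condensed sketch of it (illustrative only, the helper name prefer_javbus is hypothetical):

def prefer_javbus(javbus_json, key, airav_fallback):
    # Use the javbus JSON field when it is a non-empty string; otherwise parse the airav page.
    result = javbus_json.get(key)
    if isinstance(result, str) and len(result):
        return result
    return airav_fallback()

For instance, getStudio(htmlcode, javbus_json) matches this shape, with the airav XPath query as the fallback.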

View File

@@ -3,50 +3,42 @@ sys.path.append('..')
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'avatar-box'})
def getActorPhoto(html):
a = html.xpath('//a[@class="avatar-box"]')
d = {}
for i in a:
l = i.img['src']
t = i.span.get_text()
l = i.find('.//img').attrib['src']
t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d
def getTitle(a):
def getTitle(html):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
soup = BeautifulSoup(a, 'lxml')
a = soup.find_all(attrs={'class': 'avatar-box'})
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.span.get_text())
d.append(i.find('span').text)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getStudio(html):
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRuntime(html):
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getNum(html):
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
@@ -55,28 +47,20 @@ def getYear(release):
return result
except:
return release
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRelease(html):
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getCover(html):
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
def getCover_small(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(a): # get tags
soup = BeautifulSoup(a, 'lxml')
a = soup.find_all(attrs={'class': 'genre'})
d = []
for i in a:
d.append(i.get_text())
return d
def getSeries(htmlcode):
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return x[2:] if len(x) > 2 else []
def getSeries(html):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
@@ -86,42 +70,45 @@ def main(number):
html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
a = get_html(site + '/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
web = get_html("https:" + result1)
soup = BeautifulSoup(web, 'lxml')
info = str(soup.find(attrs={'class': 'row movie'}))
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
try:
new_number = getNum(lx)
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = getTitle(lx).strip(new_number)
dic = {
'actor': getActor(web),
'title': getTitle(web).strip(getNum(web)),
'studio': getStudio(info),
'outline': '', #
'runtime': getRuntime(info),
'actor': getActor(lx),
'title': title,
'studio': getStudio(lx),
'outline': getStoryline(number, title),
'runtime': getRuntime(lx),
'director': '', #
'release': getRelease(info),
'number': getNum(info),
'cover': getCover(web),
'cover_small': getCover_small(a),
'release': getRelease(lx),
'number': new_number,
'cover': getCover(lx),
'cover_small': getCover_small(html),
'imagecut': 3,
'tag': getTag(web),
'label': getLabel(info),
'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(web),
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)),
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': getSeries(info),
'series': getSeries(lx),
}
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -129,3 +116,4 @@ def main(number):
if __name__ == "__main__":
print(main('012717_472'))
print(main('1')) # gets a fake search result, raises 'number not found'

View File

@@ -1,34 +1,32 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
import re
from ADC_function import *
from WebCrawler.storyline import getStoryline
def main(number: str) -> json:
try:
caribbytes, browser = get_html_by_browser(
'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="browser")
if not caribbytes or not caribbytes.ok:
# Actor photo support is not used yet; temporarily commented out and switched to get_html() for speed
#r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
# return_type='browser')
#if not r.ok:
# raise ValueError("page not found")
#htmlcode = str(browser.page)
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
htmlcode = htmlbyte.decode('euc-jp')
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
lx = html.fromstring(str(browser.page))
lx = html.fromstring(htmlcode)
title = get_title(lx)
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
raise ValueError("page info not found")
except Exception as e:
if config.Config().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
dic = {
'title': get_title(lx),
'title': title,
'studio': '加勒比',
'year': get_year(lx),
'outline': get_outline(lx),
'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
@@ -47,14 +45,25 @@ def main(number: str) -> json:
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement) -> str:
return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
@@ -114,11 +123,10 @@ def get_actor_photo(browser):
if pos<0:
continue
css = html[pos:pos+100]
p0 = css.find('background: url(')
p1 = css.find('.jpg)')
if p0<0 or p1<0:
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
p = {k: urljoin(browser.url, cssBGjpgs[0])}
o.update(p)
return o
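
Example input for the new regex (a hypothetical CSS fragment):

# css = 'background: url(/moviepages/000000-000/images/n.jpg) no-repeat'
# -> cssBGjpgs[0] == '/moviepages/000000-000/images/n.jpg', which urljoin(browser.url, ...) makes absolute.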

View File

@@ -153,7 +153,7 @@ def main(number):
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
data = {
"title": "",

View File

@@ -93,6 +93,7 @@ def main(number):
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
dic = {
'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
'series': '',
}
except Exception as e:
if ADC_function.config.Config().debug():
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):
if __name__ == '__main__':
print(main('FC2-1787685'))
print(main('FC2-2086710'))

View File

@@ -103,7 +103,7 @@ def main(number):
'series': '',
}
except Exception as e:
if ADC_function.config.Config().debug():
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')

View File

@@ -1,114 +1,76 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
from WebCrawler import fanza
from WebCrawler import airav
from WebCrawler.storyline import getStoryline
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'star-name'})
def getActorPhoto(html):
actors = html.xpath('//div[@class="star-name"]/a')
d={}
for i in a:
l=i.a['href']
t=i.get_text()
html = etree.fromstring(get_html(l), etree.HTMLParser())
for i in actors:
url=i.attrib['href']
t=i.attrib['title']
html = etree.fromstring(get_html(url), etree.HTMLParser())
p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p}
d.update(p2)
return d
def getTitle(htmlcode): # get the title
doc = pq(htmlcode)
title=str(doc('div.container h3').text()).replace(' ','-')
try:
title2 = re.sub('n\d+-','',title)
return title2
except:
def getTitle(html): # get the title
title = str(html.xpath('/html/head/title/text()')[0])
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudio(htmlcode): # get the studio (revised)
html = etree.fromstring(htmlcode,etree.HTMLParser())
# if the record has no director, the studio is the 4th <p> element
if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
# if the record has a director, the studio is the 5th <p> element
elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
else:
result = ''
return result
def getYear(htmlcode): # get the year
html = etree.fromstring(htmlcode,etree.HTMLParser())
def getStudioJa(html):
x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getStudio(html): # get the studio
x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getYear(html): # get the year
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
return result[:4] if len(result)>=len('2000-01-01') else ''
def getCover(html): # get the cover URL
image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
return urljoin("https://www.javbus.com", image)
def getRelease(html): # get the release date
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getCover(htmlcode): # get the cover URL
doc = pq(htmlcode)
image = doc('a.bigImage')
return urljoin("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): # get the release date
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getRuntime(htmlcode): # get runtime in minutes (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getRuntime(html): # get runtime in minutes (revised)
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result
def getActor(htmlcode): # get actresses
def getActor(html): # get actresses
b=[]
soup=BeautifulSoup(htmlcode,'lxml')
a=soup.find_all(attrs={'class':'star-name'})
for i in a:
b.append(i.get_text())
actors = html.xpath('//div[@class="star-name"]/a')
for i in actors:
b.append(i.attrib['title'])
return b
def getNum(htmlcode): # get the movie number
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
return result
def getDirector(htmlcode): # get the director (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
else:
result = '' # the record may have no director data
return result
def getCID(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
#print(htmlcode)
def getNum(html): # get the movie number
kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return kwdlist[0]
def getDirectorJa(html):
x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getDirector(html): # get the director
x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number): # get the plot synopsis
try:
response = json.loads(airav.main(number))
result = response['outline']
return result
except:
return ''
def getSerise(htmlcode): # series (revised)
html = etree.fromstring(htmlcode, etree.HTMLParser())
# if the record has no director, the series is the 6th field
if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
# if the record has a director, the series is the 7th field
elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
else:
result = ''
return result
def getTag(htmlcode): # tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'genre'})
for i in a:
if 'onmouseout' in str(i) or '多選提交' in str(i):
continue
tag.append(translateTag_to_sc(i.get_text()))
return tag
def getOutline(number, title): # plot synopsis, queried from multiple sites concurrently
return getStoryline(number,title)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getSerise(html): # series
x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getTag(html): # tags
klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
taglist = [translateTag_to_sc(v) for v in klist[1:]]
return taglist
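getNum and getTag above split the same keywords meta tag: the first item is the ID number, the rest are the tags. A self-contained sketch (the content string is illustrative):
from lxml import etree
page = '<html><head><meta name="keywords" content="ABP-888,tagA,tagB"></head></html>'
html = etree.fromstring(page, etree.HTMLParser())
kwd = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
print(kwd[0], kwd[1:])  # -> ABP-888 ['tagA', 'tagB']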
def getExtrafanart(htmlcode): # stills (extra fanart)
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(htmlcode)
@@ -117,32 +79,34 @@ def getExtrafanart(htmlcode): # stills (extra fanart)
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number)
if getTitle(htmlcode) == '':
htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_'))
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
'studio': getStudio(htmlcode),
'year': getYear(htmlcode),
'outline': getOutline(number),
'runtime': getRuntime(htmlcode),
'director': getDirector(htmlcode),
'actor': getActor(htmlcode),
'release': getRelease(htmlcode),
'number': getNum(htmlcode),
'cover': getCover(htmlcode),
'tag': getTag(htmlcode),
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(htmlcode),
'label': getSeriseJa(lx),
'imagecut': 0,
'actor_photo': '',
# 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number,
'source': 'javbus.py',
'series': getSerise(htmlcode),
'series': getSeriseJa(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -155,32 +119,36 @@ def main(number):
htmlcode = get_html('https://www.fanbus.us/' + number)
except:
htmlcode = get_html('https://www.javbus.com/' + number)
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode,etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
'studio': getStudio(htmlcode),
'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
'outline': getOutline(number),
'runtime': getRuntime(htmlcode),
'director': getDirector(htmlcode),
'actor': getActor(htmlcode),
'release': getRelease(htmlcode),
'number': getNum(htmlcode),
'cover': getCover(htmlcode),
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'imagecut': 1,
'tag': getTag(htmlcode),
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(htmlcode),
'actor_photo': getActorPhoto(htmlcode),
'label': getSerise(lx),
# 'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(htmlcode),
'series': getSerise(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
return main_uncensored(number)
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
data = {
"title": "",
@@ -191,5 +159,13 @@ def main(number):
return js
if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
print(main('ABP-888'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
print(main('100221_001'))
print(main('AVSW-061'))

View File (WebCrawler/javdb.py)

@@ -3,25 +3,22 @@ sys.path.append('../')
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler import airav
# import sys
from mechanicalsoup.stateful_browser import StatefulBrowser
from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
def getTitle(html):
browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip()
def getActor(a):
html = etree.fromstring(a, etree.HTMLParser())
def getActor(html):
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = []
idx = 0
actor_gendor = config.Config().actor_gender()
actor_gendor = config.getInstance().actor_gender()
if actor_gendor not in ['female','male','both','all']:
actor_gendor = 'female'
for act in actors:
@@ -33,8 +30,8 @@ def getActor(a):
idx = idx + 1
return r
def getaphoto(url):
html_page = get_html(url)
def getaphoto(url, browser):
html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
img_url = img_prether.findall(html_page)
if img_url:
@@ -42,24 +39,18 @@ def getaphoto(url):
else:
return ''
def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img
actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>')
actorall = actorall_prether.findall(html)
if actorall:
actoralls = actorall[0]
actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>')
actor = actor_prether.findall(actoralls)
def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img
actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
if not actorall:
return {}
a = getActor(html)
actor_photo = {}
for i in actor:
actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0])
for i in actorall:
if i.text in a:
actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
return actor_photo
else:
return {}
def getStudio(a):
def getStudio(a, html):
# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
@@ -67,23 +58,25 @@ def getStudio(a):
patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
pianshang = patherr.findall(a)
if pianshang:
result = pianshang[0]
else:
result = ""
result = pianshang[0].strip()
if len(result):
return result
# fall back to the seller as the studio
try:
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
except:
result = ''
return result
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRuntime(html):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser())
def getNum(html):
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
@@ -113,8 +106,7 @@ def getRelease(a):
else:
result = ''
return result
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getTag(html):
try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
total = []
@@ -135,11 +127,10 @@ def getTag(a):
pass
return total
def getCover_small(a, index=0):
def getCover_small(html, index=0):
# same issue mentioned below,
# javdb sometime returns multiple results
# DO NOT just get the first one, get the one with correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
@@ -170,66 +161,76 @@ def getTrailer(htmlcode): # 获取预告片
video_url = ''
return video_url
def getExtrafanart(htmlcode): # 获取剧照
html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getExtrafanart(html): # 获取剧照
result = []
try:
result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
except:
pass
return result
def getCover(html):
try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number): # plot synopsis
def getOutline0(number): # plot synopsis; airav.wiki returns 404 for now, so the function is renamed; delete it if the site never recovers
try:
response = json.loads(airav.main(number))
result = response['outline']
htmlcode = get_html('https://cn.airav.wiki/video/' + number)
from WebCrawler.airav import getOutline as airav_getOutline
result = airav_getOutline(htmlcode)
return result
except:
pass
return ''
def getSeries(a):
#/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getOutline(number, title): # plot synopsis, queried from multiple sites concurrently
return getStoryline(number,title)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def main(number):
javdb_site = random.choice(["javdb9", "javdb30"])
# After javdb's update only one numbered mirror can be logged in at a time; the newest login kicks out the
# older one, so the site is chosen from the first javdb*.json cookie file found.
# If there is no .json file, or it is past its validity period, a site is chosen at random.
javdb_sites = ["javdb31", "javdb32"]
debug = config.getInstance().debug()
try:
# if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
# pass
# else:
# number = number.upper()
number = number.upper()
cookie_json = './' + javdb_site + '.json'
javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
# Do not load expired cookies; javdb's login page says "no login needed for 7 days", so assume cookies stay valid for 7 days
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath)
if cdays < 7:
javdb_cookies = cookies_dict
has_json = True
break
elif cdays != 9999:
print(
f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
if not has_json:
javdb_site = secrets.choice(javdb_sites)
if debug:
print(f'[!]javdb:select site {javdb_site}')
browser = None
try:
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
query_result = get_html(javdb_url, cookies=javdb_cookies)
res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
if not res.ok:
raise
query_result = res.text
except:
query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies)
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -250,61 +251,74 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
raise ValueError("number not found")
correct_url = urls[0]
try:
if isinstance(browser, StatefulBrowser): # get faster benefit from http keep-alive
detail_page = browser.open_relative(correct_url).text
else:
javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
except:
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
# etree.fromstring is expensive, so call it only once; its xpath is fast (faster than bs4 find/select) and can be used freely
lx = etree.fromstring(detail_page, etree.HTMLParser())
# no cut image by default
imagecut = 3
# If gray image exists ,then replace with normal cover
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
cover_small = getCover_small(query_result)
cover_small = getCover_small(html)
else:
try:
cover_small = getCover_small(query_result, index=ids.index(number))
cover_small = getCover_small(html, index=ids.index(number))
except:
# if input number is "STAR438" not "STAR-438", use first search result.
cover_small = getCover_small(query_result)
cover_small = getCover_small(html)
if 'placeholder' in cover_small:
# replace with the normal cover and cut it
imagecut = 1
cover_small = getCover(detail_page)
cover_small = getCover(lx)
dp_number = getNum(detail_page)
dp_number = getNum(lx)
if dp_number.upper() != number:
raise ValueError("number not found")
title = getTitle(detail_page)
title = getTitle(lx)
if title and dp_number:
number = dp_number
# remove duplicate title
title = title.replace(number, '').strip()
dic = {
'actor': getActor(detail_page),
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page),
'outline': getOutline(number),
'runtime': getRuntime(detail_page),
'director': getDirector(detail_page),
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
'number': number,
'cover': getCover(detail_page),
'cover': getCover(lx),
'cover_small': cover_small,
'trailer': getTrailer(detail_page),
'extrafanart': getExtrafanart(detail_page),
'extrafanart': getExtrafanart(lx),
'imagecut': imagecut,
'tag': getTag(detail_page),
'label': getLabel(detail_page),
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(detail_page),
# 'actor_photo': getActorPhoto(lx, javdb_site, browser),
'website': 'https://javdb.com' + correct_url,
'source': 'javdb.py',
'series': getSeries(detail_page),
'series': getSeries(lx),
}
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
dic['series'] = dic['studio']
if not dic['label']:
dic['label'] = dic['studio']
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -313,10 +327,18 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
# print(main('blacked.20.05.30'))
# print(main('AGAV-042'))
# print(main('BANK-022'))
print(main('FC2-735670'))
print(main('FC2-1174949')) # not found
# print(main('070116-197'))
# print(main('093021_539')) # no stills; studio pacopacomama
# print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
print(main('MVSD-439'))
print(main('EHM0001')) # not found
# print(main('EHM0001')) # not found
# print(main('FC2-2314275'))
# print(main('EBOD-646'))
# print(main('LOVE-262'))
print(main('ABP-890'))

View File

@@ -137,7 +137,7 @@ def main(number2):
'series': getSeries(a),
}
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
dic = {"title": ""}

WebCrawler/storyline.py (new file, 334 lines)
View File

@@ -0,0 +1,334 @@
import sys
sys.path.append('../')
import re
import json
import builtins
from ADC_function import *
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
from number_parser import is_uncensored
G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池')
class noThread(object):
def map(self, fn, param):
return builtins.map(fn, param)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
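noThread mimics just enough of the Pool interface (context manager plus map) that the run-mode selection below can treat sequential execution, thread pool and process pool uniformly. Its map is builtins.map, so it is lazy; the caller materializes it with list(), as getStoryline does for run_mode 0. A tiny sketch:
with noThread() as pool:
    r = pool.map(str.upper, ['a', 'b'])  # lazy iterator, nothing has run yet
print(list(r))  # -> ['A', 'B']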
# Fetch the plot synopsis: query the listed sites concurrently; earlier sites in the list take priority
def getStoryline(number, title, sites: list=None):
start_time = time.time()
conf = config.getInstance()
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number):
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
r_dup = set()
apply_sites = []
for s in storyine_sites:
if s in G_registered_storyline_site and s not in r_dup:
apply_sites.append(s)
r_dup.add(s)
mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = conf.storyline_mode()
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
result = pool.map(getStoryline_mp, mp_args)
result = list(result) if run_mode == 0 else result
if not debug and conf.storyline_show() == 0:
for value in result:
if isinstance(value, str) and len(value):
return value
return ''
# The debug output below is written into the log; output produced inside a process pool is not, it only appears on stdout
cnt = len(apply_sites)
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
first = True
sel = ''
for i in range(cnt):
sl = len(result[i]) if isinstance(result[i], str) else 0
if sl and first:
s += f'[选中{apply_sites[i]}字数:{sl}]'
first = False
sel = result[i]
elif sl:
s += f'{apply_sites[i]}字数:{sl}'
else:
s += f'{apply_sites[i]}:空'
print(s)
return sel
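A hedged usage sketch of the entry point above (the number and title are illustrative; with sites left as None the candidate lists come from the [storyline] section of config.ini):
from WebCrawler.storyline import getStoryline
outline = getStoryline('ABP-888', 'Some Title')  # site lists taken from config.ini
outline_cn = getStoryline('ABP-888', 'Some Title', ['avno1'])  # restrict to one registered site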
def getStoryline_mp(args):
return _getStoryline_mp(*args)
# Note: print() from a spawned process is not written to the log; when debugging broken data sources later, watch stdout directly, and screenshot it for issue reports
def _getStoryline_mp(site, number, title, debug):
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
try:
number_up = number.upper()
site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, browser = get_html_by_browser(url, return_type='browser')
if not res.ok:
raise ValueError(f"get_html_by_browser('{url}') failed")
avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
if number_up not in avs.select_one('a > h3').text.upper():
raise ValueError("number not found")
detail_url = avs.select_one('a')['href']
res = browser.open_relative(detail_url)
if not res.ok:
raise ValueError(f"browser.open_relative('{detail_url}') failed")
t = browser.page.select_one('head > title').text
airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
if number.upper() != airav_number:
raise ValueError(f"page number ->[{airav_number}] not match")
desc = browser.page.select_one('li.introduction > span').text.strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP getOutline_amazon Error: {e},number [{number}].")
pass
return None
def getStoryline_58avgo(number, debug):
try:
url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
'', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
'?status=1&Sort=Playon', '?status=1&Sort=dateupload', 'status=1&Sort=dateproduce'
]) # pick one at random so a single IP's requests don't look too uniform in the site's httpd logs
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
result, browser = get_html_by_form(url,
fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
return_type = 'browser')
if not result.ok:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
if f'searchresults.aspx?Search={kwd}' not in browser.url:
raise ValueError("number not found")
s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
link = None
for i in range(len(s)):
title = s[i].h3.text.strip()
if re.search(number, title, re.I):
link = s[i]
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or 'playon.aspx' not in browser.url:
raise ValueError("detail page not found")
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall(r'\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError("detail page number not match, got ->[{detail_number}]")
return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1(number, debug): # plot synopsis from avno1.cc
try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
]) # pick one at random so a single IP's requests don't look too uniform in the site's httpd logs
result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number},
return_type = 'browser')
if not result.ok:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
s = browser.page.select('div.type_movie > div > ul > li > div')
page_number = ''  # avoid a NameError in the raise below when the result list is empty
for i in range(len(s)):
title = s[i].a.h3.text.strip()
page_number = title[title.rfind(' '):].strip()
if re.search(number, page_number, re.I):
return s[i]['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] does not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_xcity(number, debug): # plot synopsis from xcity
try:
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("page not found")
result = browser.follow_link(browser.links(r'avod\/detail')[0])
if not result.ok:
raise ValueError("detail page not found")
return browser.page.select_one('h2.title-detail + p.lead').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
pass
return ''
def getStoryline_amazon(q_title, number, debug):
if not isinstance(q_title, str) or not len(q_title):
return None
try:
amazon_cookie, _ = load_cookies('amazon.json')
cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
url = "https://www.amazon.co.jp/s?k=" + q_title
res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
if not res.ok:
raise ValueError("get_html_by_browser() failed")
lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
if isinstance(lks, list) and len(lks):
browser.follow_link(lks[0])
cookie = None
html = etree.fromstring(str(browser.page), etree.HTMLParser())
titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
if not isinstance(idx, int) or idx < 0:
raise ValueError("title and number not found")
furl = urls[idx]
r = browser.open_relative(furl)
if not r.ok:
raise ValueError("browser.open_relative()) failed.")
lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
if isinstance(lks, list) and len(lks):
browser.follow_link(lks[0])
cookie = None
ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
ama_t = re.sub(r'審査番号:\d+', '', ama_t)
if cookie is None:
# The auto-created cookies file goes at the end of the search-path list (lowest priority). Users with an amazon.co.jp account can export cookies from their browser into an earlier search path
ama_save = Path.home() / ".local/share/avdc/amazon.json"
ama_save.parent.mkdir(parents=True, exist_ok=True)
ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
return ama_t
except Exception as e:
if debug:
print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
pass
return None
# Among the DVD and Blu-ray listings on the shelf, pick the one whose title is most similar
def amazon_select_one(a_titles, q_title, number, debug):
sel = -1
ratio = 0
que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
for loc in range(len(a_titles)):
t = a_titles[loc]
if re.search(number, t, re.I): # titles rarely carry the ID number, but a few do; an exact number match passes immediately
return loc
if not re.search('DVD|Blu-ray', t, re.I):
continue
ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I))
ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
findlen = 0
lastpos = -1
cnt = len(ama_t)
for c in reversed(ama_t):
cnt -= 1
pos = que_t.rfind(c)
if lastpos >= 0:
pos_near = que_t[:lastpos].rfind(c)
if pos_near < 0:
findlen = 0
lastpos = -1
ama_t = ama_t[:cnt+1]
else:
pos = pos_near
if pos < 0:
if category(c) == 'Nd':
return -1
ama_t = ama_t[:cnt]
findlen = 0
lastpos = -1
continue
if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
findlen += 1
lastpos = pos
if findlen >= 4:
break
continue
findlen = 1
lastpos = pos
if findlen==0:
return -1
r = SequenceMatcher(None, ama_t, que_t).ratio()
if r > ratio:
sel = loc
ratio = r
save_t_ = ama_t
if ratio > 0.999:
break
if ratio < 0.5:
return -1
if not debug:
# for now, only results with similarity of 0.9 or higher are trusted
return sel if ratio >= 0.9 else -1
# in debug mode, log recognition-accuracy data
if ratio < 0.9:
# rejected results with similarity in [0.5, 0.9) get their own log
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# log of accepted results
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel
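amazon_select_one strips punctuation (P), symbols (S) and separators (Z) from both titles via unicodedata.category before scoring them with difflib.SequenceMatcher; a stripped-down sketch of that normalize-then-ratio idea, reusing the 0.9 acceptance threshold from the code above:
import re
from difflib import SequenceMatcher
from unicodedata import category

def normalize(t: str) -> str:
    # drop every character whose Unicode category starts with P, S or Z
    return ''.join(c for c in t if not re.match(r'(P|S|Z).*', category(c), re.A))

r = SequenceMatcher(None, normalize('タイトル【DVD】'), normalize('タイトル')).ratio()
print(r, r >= 0.9)  # accepted only when similarity reaches 0.9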

View File (WebCrawler/xcity.py)

@@ -3,16 +3,12 @@ sys.path.append('../')
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
def getTitle(html):
result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
return result
@@ -43,8 +39,7 @@ def getActorPhoto(browser):
return o
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getStudio(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
except:
@@ -52,20 +47,14 @@ def getStudio(a):
return result.strip('+').replace("', '", '').replace('"', '')
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRuntime(html):
try:
result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
except:
return ''
try:
return re.findall('\d+',result1)[0]
x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
return x
except:
return ''
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getLabel(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
return result
@@ -73,8 +62,7 @@ def getLabel(a):
return ''
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser())
def getNum(html):
try:
result = html.xpath('//*[@id="hinban"]/text()')[0]
return result
@@ -90,8 +78,7 @@ def getYear(getRelease):
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getRelease(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except:
@@ -102,31 +89,22 @@ def getRelease(a):
return ''
def getTag(a):
result2=[]
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
for i in result1:
i=i.replace(u'\n','')
i=i.replace(u'\t','')
if len(i):
result2.append(i)
return result2
def getTag(html):
x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else []
def getCover_small(a, index=0):
def getCover_small(html, index=0):
# same issue mentioned below,
# javdb sometime returns multiple results
# DO NOT just get the first one, get the one with correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getCover(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
return 'https:' + result
@@ -134,8 +112,7 @@ def getCover(htmlcode):
return ''
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
def getDirector(html):
try:
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
return result
@@ -143,19 +120,21 @@ def getDirector(a):
return ''
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getOutline(html, number, title):
storyline_site = config.getInstance().storyline_site().split(',')
a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
if len(g):
return g
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
return x.replace(getNum(html), '')
except:
return ''
try:
return re.sub('\\\\\w*\d+','',result)
except:
return result
def getSeries(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
def getSeries(html):
try:
try:
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
@@ -181,11 +160,10 @@ def getExtrafanart(htmlcode): # stills (extra fanart)
return s
return ''
def main(number):
try:
def open_by_browser(number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/about/',
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
@@ -193,38 +171,44 @@ def main(number):
result = browser.follow_link(browser.links(r'avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
detail_page = str(browser.page)
return str(browser.page), browser
def main(number):
try:
detail_page, browser = open_by_browser(number)
url = browser.url
newnum = getNum(detail_page).upper()
lx = etree.fromstring(detail_page, etree.HTMLParser())
newnum = getNum(lx).upper()
number_up = number.upper()
if newnum != number_up:
if newnum == xcity_number.upper():
if newnum == number.replace('-','').upper():
newnum = number_up
else:
raise ValueError("xcity.py: number not found")
title = getTitle(lx)
dic = {
'actor': getActor(browser),
'title': getTitle(detail_page),
'studio': getStudio(detail_page),
'outline': getOutline(detail_page),
'runtime': getRuntime(detail_page),
'director': getDirector(detail_page),
'release': getRelease(detail_page),
'title': title,
'studio': getStudio(lx),
'outline': getOutline(lx, number, title),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(lx),
'number': newnum,
'cover': getCover(detail_page),
'cover': getCover(lx),
'cover_small': '',
'extrafanart': getExtrafanart(detail_page),
'imagecut': 1,
'tag': getTag(detail_page),
'label': getLabel(detail_page),
'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
# 'actor_photo': getActorPhoto(browser),
'website': url,
'source': 'xcity.py',
'series': getSeries(detail_page),
'series': getSeries(lx),
}
except Exception as e:
if config.Config().debug():
if config.getInstance().debug():
print(e)
dic = {"title": ""}

View File (config.ini)

@@ -1,12 +1,13 @@
[common]
main_mode=1
source_folder=./
failed_output_folder=failed
success_output_folder=JAV_output
soft_link=0
failed_move=1
auto_exit=0
transalte_to_sc=0
multi_threading=1
multi_threading=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
actor_gender=female
del_empty_folder=1
@@ -16,6 +17,8 @@ nfo_skip_days=30
; stop after this many video files have been processed; 0 = process all
stop_counter=0
; used together, the two settings above let you scrape or organize thousands of files in small batches without triggering bans from translation or metadata sites
ignore_failed_list=0
download_only_missing_images=1
[proxy]
;proxytype: http or socks5 or socks5h switch: 0 1
@@ -62,8 +65,7 @@ switch=0
; used to decide whether a film is uncensored
[uncensored]
uncensored_prefix=S2M,BT,LAF,SMD
uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED
[media]
; video file extensions
@@ -82,3 +84,20 @@ water=2
switch=0
extrafanart_folder=extrafanart
; plot synopsis
[storyline]
; When website is javbus/javdb/avsox/xcity/carib, the site, censored_site and uncensored_site lists are the
; candidate data sources for plot synopses. Sites within a list are queried concurrently; priority runs from
; left to right, and a later site's result is only used when every site before it returned nothing.
; airav, avno1 and 58avgo return Chinese synopses: airav only covers censored films, avno1 covers both,
; and 58avgo only covers uncensored or leaked/demosaiced films (that last capability is unused).
; xcity and amazon are Japanese; because the amazon store carries no ID numbers, the matching DVD is picked
; with only 99.6% accuracy. If all three lists are empty nothing is queried,
; which greatly speeds up scraping.
; site=
site=avno1
censored_site=airav,xcity,amazon
uncensored_site=58avgo
; run mode 0: sequential (slowest) 1: thread pool (default) 2: process pool (higher startup cost than threads; the more concurrent sites, the faster)
run_mode=1
; show_result: synopsis debug info; 0 = off, 1 = brief, 2 = verbose (the verbose part is not logged); turn on 2 to see why a synopsis source stopped working
show_result=0

config.py (191 changed lines)
View File

@@ -1,33 +1,82 @@
import os
import re
import sys
import configparser
import codecs
from pathlib import Path
G_conf_override = {
# index 0 save Config() first instance for quick access by using getInstance()
0 : None,
# register override config items
"common:main_mode" : None,
"common:source_folder" : None,
"common:auto_exit" : None,
"common:nfo_skip_days" : None,
"common:stop_counter" : None,
"common:ignore_failed_list" : None,
"debug_mode:switch" : None
}
def getInstance():
if isinstance(G_conf_override[0], Config):
return G_conf_override[0]
return Config()
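A hedged usage sketch of the pattern above: the first Config() constructed registers itself in slot 0, getInstance() hands it back everywhere, and a registered G_conf_override key shadows the ini value for every instance until it is reset to None (assumes a config.ini was found):
import config
config.G_conf_override['debug_mode:switch'] = True  # force debug on globally
assert config.getInstance().debug() is True
config.G_conf_override['debug_mode:switch'] = None  # fall back to the ini value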
class Config:
def __init__(self, path: str = "config.ini"):
path_search_order = [
path,
"./config.ini",
os.path.join(Path.home(), "avdc.ini"),
os.path.join(Path.home(), ".avdc.ini"),
os.path.join(Path.home(), ".avdc/config.ini"),
os.path.join(Path.home(), ".config/avdc/config.ini")
]
path_search_order = (
Path(path),
Path.cwd() / "config.ini",
Path.home() / "avdc.ini",
Path.home() / ".avdc.ini",
Path.home() / ".avdc/config.ini",
Path.home() / ".config/avdc/config.ini"
)
ini_path = None
for p in path_search_order:
if os.path.isfile(p):
ini_path = p
if p.is_file():
ini_path = p.resolve()
break
if ini_path:
self.conf = configparser.ConfigParser()
self.ini_path = ini_path
try:
self.conf.read(ini_path, encoding="utf-8-sig")
if self.conf.read(ini_path, encoding="utf-8-sig"):
if G_conf_override[0] is None:
G_conf_override[0] = self
except:
self.conf.read(ini_path, encoding="utf-8")
if self.conf.read(ini_path, encoding="utf-8"):
if G_conf_override[0] is None:
G_conf_override[0] = self
else:
print("[-]Config file not found!")
print("ERROR: Config file not found!")
print("Please put config file into one of the following path:")
print('\n'.join([str(p.resolve()) for p in path_search_order[2:]]))
# When the config file cannot be found, it is more reliable to bundle the matching default config at packaging
# time and generate it on demand somewhere in the search path than to let the user hunt down a config of the
# wrong version. This way a single executable is fully functional and can be run safely from any path.
res_path = None
# for pyinstaller bundles, look for config.ini inside the bundle
if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file():
res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini'
# when running as a script, look in the directory the script lives in
elif (Path(__file__).resolve().parent / 'config.ini').is_file():
res_path = Path(__file__).resolve().parent / 'config.ini'
if res_path is None:
sys.exit(2)
ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:")
if re.search('n', ins, re.I):
sys.exit(2)
# Only the user's home directory is guaranteed writable, so ~/avdc.ini is chosen as the path for the generated
# config rather than the current directory, which may not be writable. Putting the config in the current path
# is no longer encouraged; it survives only as a trick for switching between multiple configs.
write_path = path_search_order[2] # Path.home() / "avdc.ini"
write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8')
print("Config file '{}' created.".format(write_path.resolve()))
input("Press Enter key exit...")
sys.exit(0)
# self.conf = self._default_config()
# try:
# self.conf = configparser.ConfigParser()
@@ -40,13 +89,24 @@ class Config:
# print("[-]",e)
# sys.exit(3)
# #self.conf = self._default_config()
def getboolean_override(self, section, item) -> bool:
return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"])
def main_mode(self) -> str:
def getint_override(self, section, item) -> int:
return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"])
def get_override(self, section, item) -> str:
return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"])
def main_mode(self) -> int:
try:
return self.conf.getint("common", "main_mode")
return self.getint_override("common", "main_mode")
except ValueError:
self._exit("common:main_mode")
def source_folder(self) -> str:
return self.get_override("common", "source_folder")
def failed_folder(self) -> str:
return self.conf.get("common", "failed_output_folder")
@@ -61,7 +121,7 @@ class Config:
def failed_move(self) -> bool:
return self.conf.getboolean("common", "failed_move")
def auto_exit(self) -> bool:
return self.conf.getboolean("common", "auto_exit")
return self.getboolean_override("common", "auto_exit")
def transalte_to_sc(self) -> bool:
return self.conf.getboolean("common", "transalte_to_sc")
def multi_threading(self) -> bool:
@@ -70,14 +130,18 @@ class Config:
return self.conf.getboolean("common", "del_empty_folder")
def nfo_skip_days(self) -> int:
try:
return self.conf.getint("common", "nfo_skip_days")
return self.getint_override("common", "nfo_skip_days")
except:
return 30
def stop_counter(self) -> int:
try:
return self.conf.getint("common", "stop_counter")
return self.getint_override("common", "stop_counter")
except:
return 0
def ignore_failed_list(self) -> bool:
return self.getboolean_override("common", "ignore_failed_list")
def download_only_missing_images(self) -> bool:
return self.conf.getboolean("common", "download_only_missing_images")
def is_transalte(self) -> bool:
return self.conf.getboolean("transalte", "switch")
def is_trailer(self) -> bool:
@@ -173,7 +237,39 @@ class Config:
return self.conf.get("escape", "folders")
def debug(self) -> bool:
return self.conf.getboolean("debug_mode", "switch")
return self.getboolean_override("debug_mode", "switch")
def storyline_site(self) -> str:
try:
return self.conf.get("storyline", "site")
except:
return "avno1"
def storyline_censored_site(self) -> str:
try:
return self.conf.get("storyline", "censored_site")
except:
return "airav,xcity,amazon"
def storyline_uncensored_site(self) -> str:
try:
return self.conf.get("storyline", "uncensored_site")
except:
return "58avgo"
def storyline_show(self) -> int:
try:
v = self.conf.getint("storyline", "show_result")
return v if v in (0,1,2) else 2 if v > 2 else 0
except:
return 0
def storyline_mode(self) -> int:
try:
v = self.conf.getint("storyline", "run_mode")
return v if v in (0,1,2) else 2 if v > 2 else 0
except:
return 1
@staticmethod
def _exit(sec: str) -> None:
@@ -188,6 +284,7 @@ class Config:
sec1 = "common"
conf.add_section(sec1)
conf.set(sec1, "main_mode", "1")
conf.set(sec1, "source_folder", "./")
conf.set(sec1, "failed_output_folder", "failed")
conf.set(sec1, "success_output_folder", "JAV_output")
conf.set(sec1, "soft_link", "0")
@@ -199,6 +296,8 @@ class Config:
conf.set(sec1, "del_empty_folder", "1")
conf.set(sec1, "nfo_skip_days", 30)
conf.set(sec1, "stop_counter", 0)
conf.set(sec1, "ignore_failed_list", 0)
conf.set(sec1, "download_only_missing_images", 1)
sec2 = "proxy"
conf.add_section(sec2)
@@ -265,6 +364,14 @@ class Config:
conf.set(sec13, "switch", 1)
conf.set(sec13, "extrafanart_folder", "extrafanart")
sec14 = "storyline"
conf.add_section(sec14)
conf.set(sec14, "site", "avno1")
conf.set(sec14, "censored_site", "airav,xcity,amazon")
conf.set(sec14, "uncensored_site", "58avgo")
conf.set(sec14, "show_result", 0)
conf.set(sec14, "run_mode", 1)
return conf
@@ -308,9 +415,45 @@ if __name__ == "__main__":
code = compile(evstr, "<string>", "eval")
print('{}: "{}"'.format(evstr, eval(code)))
config = Config()
mfilter = ('conf', 'proxy', '_exit', '_default_config')
mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'}
for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
evprint(f'config.{_m}()')
pfilter = ('proxies', 'SUPPORT_PROXY_TYPE')
for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]:
evprint(f'config.proxy().{_p}')
pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}
# test getInstance()
assert(getInstance() == config)
for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]:
evprint(f'getInstance().proxy().{_p}')
# Override Test
G_conf_override["common:nfo_skip_days"] = 4321
G_conf_override["common:stop_counter"] = 1234
assert config.nfo_skip_days() == 4321
assert getInstance().stop_counter() == 1234
# remove override
G_conf_override["common:stop_counter"] = None
G_conf_override["common:nfo_skip_days"] = None
assert config.nfo_skip_days() != 4321
assert config.stop_counter() != 1234
# Create new instance
conf2 = Config()
assert getInstance() != conf2
assert getInstance() == config
G_conf_override["common:main_mode"] = 9
G_conf_override["common:source_folder"] = "A:/b/c"
# Override effect to all instances
assert config.main_mode() == 9
assert conf2.main_mode() == 9
assert getInstance().main_mode() == 9
assert conf2.source_folder() == "A:/b/c"
print("### Override Test ###".center(36))
evprint('getInstance().main_mode()')
evprint('config.source_folder()')
G_conf_override["common:main_mode"] = None
evprint('conf2.main_mode()')
evprint('config.main_mode()')
# accessing an unregistered key raises an exception
try:
print(G_conf_override["common:actor_gender"])
except KeyError as ke:
print(f'Caught KeyError: {ke} is not a registered key of the G_conf_override dict.', file=sys.stderr)
print(f"Load Config file '{conf2.ini_path}'.")

core.py (263 changed lines)
View File

@@ -3,8 +3,6 @@ import os.path
import pathlib
import re
import shutil
import platform
import errno
import sys
from PIL import Image
@@ -14,7 +12,7 @@ from datetime import datetime
from ADC_function import *
from WebCrawler import get_data_from_json
from number_parser import is_uncensored
def escape_path(path, escape_literals: str): # Remove escape literals
backslash = '\\'
@@ -23,7 +21,8 @@ def escape_path(path, escape_literals: str): # Remove escape literals
return path
def moveFailedFolder(filepath, conf):
def moveFailedFolder(filepath):
conf = config.getInstance()
failed_folder = conf.failed_folder()
soft_link = conf.soft_link()
# In mode 3 or with soft links, a failed list is maintained instead; it is loaded when the scan starts so those paths are skipped and not reprocessed
@@ -33,7 +32,6 @@ def moveFailedFolder(filepath, conf):
print("[-]Add to Failed List file, see '%s'" % ftxt)
with open(ftxt, 'a', encoding='utf-8') as flt:
flt.write(f'{filepath}\n')
flt.close()
elif conf.failed_move() and not soft_link:
failed_name = os.path.join(failed_folder, os.path.basename(filepath))
mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt'))
@@ -41,8 +39,13 @@ def moveFailedFolder(filepath, conf):
with open(mtxt, 'a', encoding='utf-8') as wwibbmt:
tmstr = datetime.now().strftime("%Y-%m-%d %H:%M")
wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n')
wwibbmt.close()
try:
if os.path.exists(failed_name):
print('[-]File Exists while moving to FailedFolder')
return
shutil.move(filepath, failed_name)
except:
print('[-]File Moving to FailedFolder unsuccessful!')
def get_info(json_data): # return the fields from the json
@@ -63,14 +66,15 @@ def get_info(json_data): # 返回json里的数据
return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label
def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath):
def small_cover_check(path, number, cover_small, leak_word, c_word, filepath):
filename = f"{number}{leak_word}{c_word}-poster.jpg"
download_file_with_filename(cover_small, filename, path, conf, filepath)
download_file_with_filename(cover_small, filename, path, filepath)
print('[+]Image Downloaded! ' + os.path.join(path, filename))
def create_folder(json_data, conf: config.Config): # create the folder
def create_folder(json_data): # create the folder
title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
conf = config.getInstance()
success_folder = conf.success_folder()
actor = json_data.get('actor')
location_rule = eval(conf.location_rule(), json_data)
@@ -81,35 +85,40 @@ def create_folder(json_data, conf: config.Config): # create the folder
if 'title' in conf.location_rule() and len(title) > maxlen:
shorttitle = title[0:maxlen]
location_rule = location_rule.replace(title, shorttitle)
path = os.path.join(success_folder, location_rule).strip()
if not os.path.isdir(path):
# When the actor is empty, location_rule evaluates to '/number', an absolute path, which makes the path join ignore its first argument; prepend ./ so the result is always relative
path = os.path.join(success_folder, f'./{location_rule.strip()}')
if not os.path.exists(path):
path = escape_path(path, conf.escape_literals())
try:
os.makedirs(path)
if not os.path.isdir(path):
raise
except:
path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number")
path = escape_path(path, conf.escape_literals())
try:
os.makedirs(path)
return path
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
return os.path.normpath(path)
# ===================== resource download section ===========================
# path example: photo, video in the project folder!
def download_file_with_filename(url, filename, path, conf: config.Config, filepath):
def download_file_with_filename(url, filename, path, filepath):
conf = config.getInstance()
configProxy = conf.proxy()
for i in range(configProxy.retry):
try:
if configProxy.enable:
if not os.path.isdir(path):
if not os.path.exists(path):
try:
os.makedirs(path)
if not os.path.isdir(path):
raise IOError
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
proxies = configProxy.proxies()
headers = {
'User-Agent': G_USER_AGENT}
@@ -121,10 +130,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
code.write(r.content)
return
else:
if not os.path.isdir(path):
if not os.path.exists(path):
try:
os.makedirs(path)
if not os.path.isdir(path):
raise IOError
except:
print(f"[-]Fatal error! Can not make folder '{path}'")
sys.exit(0)
headers = {
'User-Agent': G_USER_AGENT}
r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -148,46 +159,50 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
except IOError:
print(f"[-]Create Directory '{path}' failed!")
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
print('[-]Connect Failed! Please check your Proxy or Network!')
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config):
if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed':
def trailer_download(trailer, leak_word, c_word, number, path, filepath):
if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed':
return
configProxy = conf.proxy()
configProxy = config.getInstance().proxy()
for i in range(configProxy.retry):
if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0:
if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'):
print('[!]Video Download Failed! Trying again. [{}/3]', i + 1)
download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath)
download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath)
continue
else:
break
if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0:
if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'):
return
print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4')
# stills downloaded successfully; otherwise move to failed
def extrafanart_download(data, path, conf: config.Config, filepath):
def extrafanart_download(data, path, filepath):
j = 1
conf = config.getInstance()
path = os.path.join(path, conf.get_extrafanart())
configProxy = conf.proxy()
download_only_missing_images = conf.download_only_missing_images()
for url in data:
jpg_filename = f'extrafanart-{j}.jpg'
jpg_fullpath = os.path.join(path, jpg_filename)
if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed':
moveFailedFolder(filepath, conf)
if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
continue
if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed':
moveFailedFolder(filepath)
return
configProxy = conf.proxy()
for i in range(configProxy.retry):
if os.path.getsize(jpg_fullpath) == 0:
if file_not_exist_or_empty(jpg_fullpath):
print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
download_file_with_filename(url, jpg_filename, path, conf, filepath)
download_file_with_filename(url, jpg_filename, path, filepath)
continue
else:
break
if os.path.getsize(jpg_fullpath) == 0:
if file_not_exist_or_empty(jpg_fullpath):
return
print('[+]Image Downloaded!', jpg_fullpath)
j += 1
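The retry loops in this file now call file_not_exist_or_empty() (pulled in by the star import from ADC_function; its body is not shown in this diff) instead of comparing os.path.getsize() to zero, which also covers the missing-file case. A minimal sketch of the assumed helper:
import os

def file_not_exist_or_empty(filepath) -> bool:
    # assumed behavior: True when the target is missing or zero bytes,
    # i.e. the download needs to be (re)tried
    return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0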
@@ -195,39 +210,46 @@ def extrafanart_download(data, path, conf: config.Config, filepath):
# whether the cover downloaded successfully; otherwise move to failed
def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath):
def image_download(cover, number, leak_word, c_word, path, filepath):
filename = f"{number}{leak_word}{c_word}-fanart.jpg"
full_filepath = os.path.join(path, filename)
if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed':
moveFailedFolder(filepath, conf)
if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
return
if download_file_with_filename(cover, filename, path, filepath) == 'failed':
moveFailedFolder(filepath)
return
configProxy = conf.proxy()
configProxy = config.getInstance().proxy()
for i in range(configProxy.retry):
if os.path.getsize(full_filepath) == 0:
if file_not_exist_or_empty(full_filepath):
print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
download_file_with_filename(cover, filename, path, conf, filepath)
download_file_with_filename(cover, filename, path, filepath)
continue
else:
break
if os.path.getsize(full_filepath) == 0:
if file_not_exist_or_empty(full_filepath):
return
print('[+]Image Downloaded!', full_filepath)
shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg"))
def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf):
def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored):
title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
failed_folder = conf.failed_folder()
if conf.main_mode() == 3: # In mode 3 the video file is left untouched, so the .nfo must match the video filename exactly (apart from the extension) for KODI and similar software to find it
if config.getInstance().main_mode() == 3: # In mode 3 the video file is left untouched, so the .nfo must match the video filename exactly (apart from the extension) for KODI and similar software to find it
nfo_path = str(Path(filepath).with_suffix('.nfo'))
else:
nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo")
try:
if not os.path.isdir(path):
if not os.path.exists(path):
try:
os.makedirs(path)
if not os.path.isdir(path):
raise IOError
except:
print(f"[-]Fatal error! can not make folder '{path}'")
sys.exit(0)
# KODI cannot find the number when viewing film info; naming_rule=number+'#'+title would fix that
# but makes the title too long, so putting the number into the usually-empty outline fits better, and software gives outline a larger display area anyway
outline = f"{number}#{outline}"
with open(nfo_path, "wt", encoding='UTF-8') as code:
print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
print("<movie>", file=code)
@@ -279,7 +301,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
print(" <num>" + number + "</num>", file=code)
print(" <premiered>" + release + "</premiered>", file=code)
print(" <cover>" + cover + "</cover>", file=code)
if config.Config().is_trailer():
if config.getInstance().is_trailer():
print(" <trailer>" + trailer + "</trailer>", file=code)
print(" <website>" + website + "</website>", file=code)
print("</movie>", file=code)
@@ -287,12 +309,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
except IOError as e:
print("[-]Write Failed!")
print("[-]", e)
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
except Exception as e1:
print("[-]Write Failed!")
print("[-]", e1)
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
@@ -321,7 +343,7 @@ def cutImage(imagecut, path, number, leak_word, c_word):
# leak (leaked copy): value 1 or 0
# uncensored: value 1 or 0
# ======================================================================== add watermark
def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config):
def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored):
mark_type = ''
if cn_sub:
mark_type += ',字幕'
@@ -331,17 +353,17 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Conf
mark_type += ',无码'
if mark_type == '':
return
add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf)
add_mark_thread(thumb_path, cn_sub, leak, uncensored)
print('[+]Thumb Add Mark: ' + mark_type.strip(','))
add_mark_thread(poster_path, cn_sub, leak, uncensored, conf)
add_mark_thread(poster_path, cn_sub, leak, uncensored)
print('[+]Poster Add Mark: ' + mark_type.strip(','))
def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf):
def add_mark_thread(pic_path, cn_sub, leak, uncensored):
size = 14
img_pic = Image.open(pic_path)
# Read the user-configured starting corner; the modulo arithmetic below rotates clockwise from it
# top-left 0, top-right 1, bottom-right 2, bottom-left 3
count = conf.watermark_type()
count = config.getInstance().watermark_type()
if cn_sub == 1 or cn_sub == '1':
add_to_pic(pic_path, img_pic, size, count, 1) # add the Chinese-subtitle mark
count = (count + 1) % 4
@@ -391,29 +413,38 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
img_pic.save(pic_path, quality=95)
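The clockwise rotation above maps count 0..3 to image corners. A minimal sketch of the corner arithmetic, assuming marks are pasted flush into each corner (the helper name and flush placement are illustrative, not the project's actual code):

def corner_position(img_w, img_h, mark_w, mark_h, count):
    # count: 0 top-left, 1 top-right, 2 bottom-right, 3 bottom-left (clockwise)
    positions = (
        (0, 0),
        (img_w - mark_w, 0),
        (img_w - mark_w, img_h - mark_h),
        (0, img_h - mark_h),
    )
    return positions[count % 4]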
# ======================== end =================================
def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config): # file path, number, suffix, destination folder
def paste_file_to_folder(filepath, path, number, leak_word, c_word): # file path, number, suffix, destination folder
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
try:
targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}")
# Never overwrite under any circumstances: if a data-source or engine bug assigned every file the same number,
# successive same-name overwrites would be the worst case, destroying all files beyond recovery
if os.path.exists(targetpath):
raise FileExistsError('File Exists on destination path, we will never overwriting.')
soft_link = config.getInstance().soft_link()
# if soft_link=1, use a symbolic link
if conf.soft_link() == 0:
if soft_link == 0:
shutil.move(filepath, targetpath)
elif conf.soft_link() == 1:
# use a relative path so the video still opens correctly when accessed over the network
elif soft_link == 1:
# First try a relative path so the video still opens correctly when accessed over the network; if that fails,
# likely because relative paths are unsupported across different drives, retry the symlink with an absolute path
try:
filerelpath = os.path.relpath(filepath, path)
os.symlink(filerelpath, targetpath)
elif conf.soft_link() == 2:
except:
os.symlink(filepath_obj.resolve(), targetpath)
elif soft_link == 2:
shutil.move(filepath, targetpath)
# After moving the file, leave a traceable symlink at the original location pointing to the new one,
# so it can always be traced where the file was moved and a rename/move based on a wrong number cannot lose it,
# making manual recovery easy. Also change the symlink's extension so it will not be scraped again
# making manual recovery easy. Since symlinks are no longer scraped, the extension no longer needs changing
targetabspath = os.path.abspath(targetpath)
if targetabspath != os.path.abspath(filepath):
targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
os.symlink(targetrelpath, filepath + '#sym')
sub_res = conf.sub_rule()
os.symlink(targetrelpath, filepath)
sub_res = config.getInstance().sub_rule()
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
@@ -422,9 +453,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config
print('[+]Sub moved!')
return True
except FileExistsError:
print('[-]File Exists! Please check your movie!')
print('[-]move to the root folder of the program.')
except FileExistsError as fee:
print(f'[-]FileExistsError: {fee}')
moveFailedFolder(filepath)
return
except PermissionError:
print('[-]Error! Please run as administrator!')
@@ -434,19 +465,22 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config
return
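Why the relative-symlink attempt above is wrapped in try/except: on Windows, os.path.relpath raises ValueError when the source and target sit on different drives, so no relative path exists. A small illustration (paths hypothetical):

import os
try:
    rel = os.path.relpath(r'D:\movies\a.mp4', r'C:\library')
except ValueError:
    rel = None  # cross-drive: fall back to an absolute symlink target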
def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf): # file path, number, suffix, destination folder
def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word): # file path, number, suffix, destination folder
if multi_part == 1:
number += part # number now has the CD1-style part suffix appended
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")
if os.path.exists(targetpath):
raise FileExistsError('File Exists on destination path, we will never overwriting.')
try:
if conf.soft_link():
os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
if config.getInstance().soft_link():
os.symlink(filepath, targetpath)
else:
shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
shutil.move(filepath, targetpath)
sub_res = conf.sub_rule()
sub_res = config.getInstance().sub_rule()
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
if os.path.isfile(sub_filepath): # move subtitle
@@ -454,9 +488,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
print('[+]Sub moved!')
print('[!]Success')
return True
except FileExistsError:
print('[-]File Exists! Please check your movie!')
print('[-]move to the root folder of the program.')
except FileExistsError as fee:
print(f'[-]FileExistsError: {fee}')
return
except PermissionError:
print('[-]Error! Please run as administrator!')
@@ -465,7 +498,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
print(f'[-]OS Error errno {oserr.errno}')
return
def get_part(filepath, conf):
def get_part(filepath):
try:
if re.search('-CD\d+', filepath):
return re.findall('-CD\d+', filepath)[0]
@@ -473,7 +506,7 @@ def get_part(filepath, conf):
return re.findall('-cd\d+', filepath)[0]
except:
print("[-]failed!Please rename the filename again!")
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
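The -CD pattern match above, illustrated (file name hypothetical):

import re
assert re.findall(r'-CD\d+', 'SNIS-829-CD2.mp4')[0] == '-CD2'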
@@ -493,7 +526,8 @@ def debug_print(data: json):
pass
def core_main(file_path, number_th, conf: config.Config):
def core_main(file_path, number_th):
conf = config.getInstance()
# ======================================================================= Initialize required variables
multi_part = 0
part = ''
@@ -507,11 +541,11 @@ def core_main(file_path, number_th, conf: config.Config):
# the commented-out variable below is not needed
#rootpath= os.getcwd
number = number_th
json_data = get_data_from_json(number, conf) # look up metadata for the number
json_data = get_data_from_json(number) # look up metadata for the number
# Return if blank dict returned (data not found)
if not json_data:
moveFailedFolder(filepath, conf)
moveFailedFolder(filepath)
return
if json_data["number"] != number:
@@ -526,16 +560,13 @@ def core_main(file_path, number_th, conf: config.Config):
# ======================================================================= Detect -C / -CD suffixes
if '-CD' in filepath or '-cd' in filepath:
multi_part = 1
part = get_part(filepath, conf)
part = get_part(filepath)
if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
cn_sub = '1'
c_word = '-C' # suffix for films with Chinese subtitles
# determine whether the film is uncensored
if is_uncensored(number):
uncensored = 1
else:
uncensored = 0
uncensored = 1 if is_uncensored(number) else 0
if '流出' in filepath or 'uncensored' in filepath:
@@ -550,7 +581,7 @@ def core_main(file_path, number_th, conf: config.Config):
debug_print(json_data)
# create the folder
#path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf)
#path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data)
# main_mode
# 1: 刮削模式 / Scraping mode
@@ -558,54 +589,55 @@ def core_main(file_path, number_th, conf: config.Config):
# 3: scrape in place without changing paths
if conf.main_mode() == 1:
# create the folder
path = create_folder(json_data, conf)
path = create_folder(json_data)
if multi_part == 1:
number += part # number now has the CD1-style part suffix appended
# check the small cover; download it when imagecut is 3
if imagecut == 3:
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
# create_folder returns the per-number folder path
image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath)
if not multi_part or part.lower() == '-cd1':
try:
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
except:
pass
try:
# download extrafanart stills: data, path, conf: config.Config, filepath
# download extrafanart stills: data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
extrafanart_download(json_data.get('extrafanart'), path, filepath)
except:
pass
# crop the cover image
cutImage(imagecut, path, number, leak_word, c_word)
# write the metadata files
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf)
# move the file
paste_file_to_folder(filepath, path, number, leak_word, c_word, conf)
# add watermark
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
# move the movie
paste_file_to_folder(filepath, path, number, leak_word, c_word)
# finally write the .nfo metadata file; its creation marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored)
elif conf.main_mode() == 2:
# create the folder
path = create_folder(json_data, conf)
path = create_folder(json_data)
# move the file
paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf)
paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word)
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
elif conf.main_mode() == 3:
path = str(Path(file_path).parent)
@@ -614,28 +646,29 @@ def core_main(file_path, number_th, conf: config.Config):
# check the small cover; download it when imagecut is 3
if imagecut == 3:
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
# create_folder returns the per-number folder path
image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath)
if not multi_part or part.lower() == '-cd1':
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
# download extrafanart stills: data, path, conf: config.Config, filepath
# download extrafanart stills: data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
extrafanart_download(json_data.get('extrafanart'), path, filepath)
# crop the cover image
cutImage(imagecut, path, number, leak_word, c_word)
# write the metadata files
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
tag, json_data.get('actor_list'), liuchu, uncensored, conf)
# add watermark
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
# finally write the .nfo metadata file; its creation marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
tag, json_data.get('actor_list'), liuchu, uncensored)
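A hypothetical driver sketch showing how core_main is reached (the real AV_Data_Capture.py adds argument parsing, logging, and failed-file bookkeeping; movie_list and this loop are assumptions for illustration):

for movie in movie_list:          # movie_list: file paths gathered by the scanner (assumed)
    n = get_number(False, movie)  # parse the number from the file name
    if n:
        core_main(movie, n)       # scrape metadata, download art, write .nfo, move the file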

View File

@@ -1,14 +1,14 @@
import os
import re
from core import *
import sys
import config
G_spat = re.compile(
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
re.IGNORECASE)
def get_number(debug,filepath: str) -> str:
def get_number(debug,file_path: str) -> str:
# """
# >>> from number_parser import get_number
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
@@ -32,77 +32,174 @@ def get_number(debug,filepath: str) -> str:
# >>> get_number("snis-829-C.mp4")
# 'snis-829'
# """
filepath = os.path.basename(filepath)
if debug == False:
filepath = os.path.basename(file_path)
# The debug True and False branches were merged: this module and function only do string computation with no I/O, so when debug is on, printing the exception info is sufficient
try:
if '-' in filepath or '_' in filepath: # standard number extraction, mainly for numbers containing - or _
#filepath = filepath.replace("_", "-")
file_number = get_number_by_dict(filepath)
if file_number:
return file_number
elif '-' in filepath or '_' in filepath: # standard number extraction, mainly for numbers containing - or _
filepath = G_spat.sub("", filepath)
filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # strip date stamps from the file name
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
file_number = get_number_by_dict(lower_check)
if file_number:
return file_number
return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
else: # extract numbers without a hyphen (FANZA CID)
# western-release number matching rule
oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
if oumei:
return oumei.group()
try:
return str(
re.findall(r'(.+?)\.',
str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
"['']").replace('_', '-')
except:
return re.search(r'(.+?)\.', filepath)[0]
return str(re.search(r'(.+?)\.', filepath)[0])
except Exception as e:
print('[-]' + str(e))
return
elif debug == True:
if '-' in filepath or '_' in filepath: # standard number extraction, mainly for numbers containing - or _
#filepath = filepath.replace("_", "-")
filepath = G_spat.sub("", filepath)
filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # strip date stamps from the file name
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
file_number = get_number_by_dict(lower_check)
if file_number:
return file_number
return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
else: # extract numbers without a hyphen (FANZA CID)
# western-release number matching rule
oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
if oumei:
return oumei.group()
try:
return str(
re.findall(r'(.+?)\.',
str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
"['']").replace('_', '-')
except:
return re.search(r'(.+?)\.', filepath)[0]
G_TAKE_NUM_RULES = {
'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()),
'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'),
'1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'),
'10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'),
'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group())
}
def get_number_by_dict(lower_filename: str) -> str:
for k,v in G_TAKE_NUM_RULES.items():
if k in lower_filename:
return v(lower_filename)
if debug:
print(f'[-]Number Parser exception: {e} [{file_path}]')
return None
# if __name__ == "__main__":
# extract the number following the javdb data source's naming conventions
G_TAKE_NUM_RULES = {
'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
'1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
'10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0])
}
def get_number_by_dict(filename: str) -> str:
try:
for k,v in G_TAKE_NUM_RULES.items():
if re.search(k, filename, re.I):
return v(filename)
except:
pass
return None
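A quick sanity check of the rule table above (file names are made up; the expected values follow from the regexes):

assert get_number_by_dict('carib-020317_001.mp4') == '020317-001'
assert get_number_by_dict('1pondo-080521_001.mp4') == '080521_001'
assert get_number_by_dict('heydouga-4102-023-cd2.iso') == 'heydouga-4102-023'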
class Cache_uncensored_conf:
prefix = None
def is_empty(self):
return bool(self.prefix is None)
def set(self, v: list):
if not v or not len(v) or not len(v[0]):
raise ValueError('input prefix list empty or None')
s = v[0]
if len(v) > 1:
for i in v[1:]:
s += f"|{i}.+"
self.prefix = re.compile(s, re.I)
def check(self, number):
if self.prefix is None:
raise ValueError('No init re compile')
return self.prefix.match(number)
G_cache_uncensored_conf = Cache_uncensored_conf()
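Usage sketch of the cache (prefixes invented for illustration): set() compiles the user's comma-separated uncensored prefixes once, and check() then matches numbers against them:

cache = Cache_uncensored_conf()
cache.set(['S2M', 'BT'])      # compiles the pattern 'S2M|BT.+' (case-insensitive)
assert cache.check('S2M-001')
assert cache.check('BT-123')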
# ======================================================================== Uncensored check
def is_uncensored(number):
if re.match(
r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
number,
re.I
):
return True
if G_cache_uncensored_conf.is_empty():
G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
return G_cache_uncensored_conf.check(number)
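The built-in regex covers the common uncensored-studio number shapes; everything else defers to the configured prefix list. For example (numbers illustrative):
#   is_uncensored('010115-001')   -> True   (matches [\d-]{4,})
#   is_uncensored('080521_001')   -> True   (matches \d{6}_\d{2,3})
#   is_uncensored('SNIS-829')     -> decided by the user-configured prefix list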
if __name__ == "__main__":
# import doctest
# doctest.testmod(raise_on_error=True)
test_use_cases = (
"Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取
"TokyoHot-n1287-HD SP2006 .mp4",
"caribean-020317_001.nfo", # -号误命名为_号的
"257138_3xplanet_1Pondo_080521_001.mp4",
"ADV-R0624-CD3.wmv", # 多碟影片
"XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源
"xxx-av 20589.mp4",
"Muramura-102114_145-HD.wmv", # 新支持片商格式 102114_145 命名规则来自javdb数据源
"heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
"HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源
"pacopacomama-093021_539-FHD.mkv" # 新支持片商格式 093021_539 命名规则来自javdb数据源
)
def evprint(evstr):
code = compile(evstr, "<string>", "eval")
print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
for t in test_use_cases:
evprint(f'get_number(True, "{t}")')
if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
sys.exit(0)
# Use Everything's ES command-line tool to collect video file names from entire disks as number-parser test cases; the argument is a drive letter A .. Z or a path that includes a drive letter
# https://www.voidtools.com/support/everything/command_line_interface/
# The ES command-line tool requires the Everything search engine to be running, and the single executable es.exe must be placed on the PATH.
# Everything is free software
# Examples:
# python.exe .\number_parser.py ALL # search all disks for videos
# python.exe .\number_parser.py D # search drive D
# python.exe .\number_parser.py D: # same as above
# python.exe .\number_parser.py D:\download\JAVs # search the \download\JAVs directory on drive D; the path must include the drive letter
# ==================
# On Linux/WSL1|2, use mlocate (Ubuntu/Debian) or plocate (Debian sid) to collect video file names from all disks as number test cases
# Install with 'sudo apt install mlocate' (or plocate) and run sudo updatedb once to build the full-disk index
# On MAC OS X use glocate from findutils; install with 'sudo brew install findutils' and run sudo gupdatedb once to build the index
# Examples:
# python3 ./number_parser.py ALL
import subprocess
ES_search_path = "ALL disks"
if sys.argv[1] == "ALL":
if sys.platform == "win32":
# ES_prog_path = 'C:/greensoft/es/es.exe'
ES_prog_path = 'es.exe' # es.exe must be in a directory listed in the PATH environment variable
ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('gb18030') # Chinese Windows 10 x64 outputs GB18030 by default; it is a Unicode dialect with a bijective mapping to UTF-8, so decoding is lossless
out_list = out_text.splitlines()
elif sys.platform in ("linux", "darwin"):
ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('utf-8')
out_list = [ os.path.basename(line) for line in out_text.splitlines()]
else:
print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.')
sys.exit(1)
else: # Windows single disk
if sys.platform != "win32":
print('[!]Usage: python3 ./number_parser.py ALL')
sys.exit(0)
# ES_prog_path = 'C:/greensoft/es/es.exe'
ES_prog_path = 'es.exe' # es.exe must be in a directory listed in the PATH environment variable
if os.path.isdir(sys.argv[1]):
ES_search_path = sys.argv[1]
else:
ES_search_path = sys.argv[1][0] + ':/'
if not os.path.isdir(ES_search_path):
ES_search_path = 'C:/'
ES_search_path = os.path.normcase(ES_search_path)
ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
out_bytes = subprocess.check_output(ES_cmdline.split(' '))
out_text = out_bytes.decode('gb18030') # Chinese Windows 10 x64 outputs GB18030 by default; it is a Unicode dialect with a bijective mapping to UTF-8, so decoding is lossless
out_list = out_text.splitlines()
print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
print(f'[+]Found {len(out_list)} movies.')
for filename in out_list:
try:
n = get_number(True, filename)
if n:
print(' [{0}] {2}# {1}'.format(n, filename, '#uncensored' if is_uncensored(n) else ''))
else:
print(f'[-]Number parser returned None. # {filename}')
except Exception as e:
print(f'[-]Number Parser exception: {e} [{filename}]')
sys.exit(0)

View File

@@ -10,7 +10,8 @@ pyinstaller --onefile AV_Data_Capture.py `
--hidden-import ADC_function.py `
--hidden-import core.py `
--add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
--add-data "Img;Img"
--add-data "Img;Img" `
--add-data "config.ini;." `
rmdir -Recurse -Force build
rmdir -Recurse -Force __pycache__
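One portability note on the --add-data arguments used throughout these build scripts: PyInstaller expects ';' as the src/dest separator on Windows and ':' elsewhere, which Python's os.pathsep mirrors. A small sketch for building the argument portably (variable name illustrative):

import os
add_data_arg = f"config.ini{os.pathsep}."  # 'config.ini;.' on Windows, 'config.ini:.' on POSIX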

View File

@@ -1,4 +1,8 @@
pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448
pip install pyquery pyinstaller
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "Img:Img" \
--add-data "config.ini:." \
cp config.ini ./dist

View File

@@ -12,5 +12,9 @@
#fi
pip3 install -r requirements.txt
pip3 install cloudscraper==1.2.52
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
--add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
--add-data "Img:Img" \
--add-data "config.ini:." \
cp config.ini ./dist