Merge pull request #607 from lededev/log-3

Continue refining the new features submitted last month
Yoshiko2 committed (via GitHub) on 2021-10-22 00:30:38 +08:00
23 changed files with 1669 additions and 847 deletions

View File

@@ -42,6 +42,8 @@ jobs:
           --hidden-import ADC_function.py \
           --hidden-import core.py \
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+          --add-data "Img:Img" \
+          --add-data "config.ini:." \
       - name: Build with PyInstaller for windows
         if: matrix.os == 'windows-latest'
@@ -51,6 +53,8 @@ jobs:
           --hidden-import ADC_function.py `
           --hidden-import core.py `
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+          --add-data "Img;Img" `
+          --add-data "config.ini;." `
       - name: Copy config.ini
         run: |

View File

@@ -1,8 +1,8 @@
 from os import replace
 import requests
-import hashlib
+#import hashlib
 from pathlib import Path
-import random
+import secrets
 import os.path
 import uuid
 import json
@@ -20,12 +20,12 @@ def getXpathSingle(htmlcode, xpath):
     return result1
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
 # Core web-request helper
 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
-    verify = config.Config().cacert_file()
-    configProxy = config.Config().proxy()
+    verify = config.getInstance().cacert_file()
+    configProxy = config.getInstance().proxy()
     errors = ""
     if ua is None:
@@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
-    configProxy = config.Config().proxy()
+    configProxy = config.getInstance().proxy()
     errors = ""
     headers_ua = {"User-Agent": G_USER_AGENT}
     if headers is None:
@@ -85,8 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
 def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
-    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
-    configProxy = config.Config().proxy()
+    s = None
+    if isinstance(cookies, dict) and len(cookies):
+        s = requests.Session()
+        requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+    configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
     result = browser.open(url)
@@ -103,17 +107,19 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
         return result.text
-def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
-    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
-    if isinstance(cookies, dict):
-        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
-    configProxy = config.Config().proxy()
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
+    if isinstance(cookies, dict) and len(cookies):
+        s = requests.Session()
+        requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+    configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
     result = browser.open(url)
     if not result.ok:
         return ''
-    form = browser.select_form() if form_name is None else browser.select_form(form_name)
+    form = browser.select_form() if form_select is None else browser.select_form(form_select)
     if isinstance(fields, dict):
         for k, v in fields.items():
             browser[k] = v
@@ -131,7 +137,7 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
 # def get_javlib_cookie() -> [dict, str]:
 #     import cloudscraper
-#     switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
+#     switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy()
 #     proxies = get_proxy(proxy, proxytype)
 #
 #     raw_cookie = {}
@@ -158,7 +164,7 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
 def translateTag_to_sc(tag):
-    tranlate_to_sc = config.Config().transalte_to_sc()
+    tranlate_to_sc = config.getInstance().transalte_to_sc()
     if tranlate_to_sc:
         dict_gen = {'中文字幕': '中文字幕',
             '高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头',
@@ -505,8 +511,11 @@ def translate(
     delay: int = 0,
 ):
     trans_result = ""
+    # Chinese text containing symbols such as '&' gets truncated by Google Translate and loses content,
+    # and translating Chinese into Chinese is pointless anyway, so only translate text containing Japanese kana.
+    if not is_japanese(src):
+        return src
     if engine == "google-free":
-        gsite = config.Config().get_translate_service_site()
+        gsite = config.getInstance().get_translate_service_site()
         if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite):
             gsite = 'translate.google.cn'
         url = (
@@ -521,7 +530,7 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
         trans_result = trans_result.join(translate_list)
     # elif engine == "baidu":
     #     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-    #     salt = random.randint(1, 1435660288)
+    #     salt = secrets.randbelow(1435660287) + 1  # random.randint(1, 1435660288)
     #     sign = app_id + src + str(salt) + key
     #     sign = hashlib.md5(sign.encode()).hexdigest()
     #     url += (
@@ -560,17 +569,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
     return trans_result
-# ======================================================================== uncensored-number check
-def is_uncensored(number):
-    if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper():
-        return True
-    configs = config.Config().get_uncensored()
-    prefix_list = str(configs).split(',')
-    for pre in prefix_list:
-        if pre.upper() in number.upper():
-            return True
-    return False
 # Cookies exported from a logged-in browser session let the scraper open member-only pages that guests cannot reach
 # Example: FC2-755670  url https://javdb9.com/v/vO8Mn
 # JSON file format:
@@ -593,20 +591,20 @@ def load_cookies(filename):
     filename = os.path.basename(filename)
     if not len(filename):
         return None, None
-    path_search_order = [
-        f"./{filename}",
-        os.path.join(Path.home(), filename),
-        os.path.join(Path.home(), f".avdc/{filename}"),
-        os.path.join(Path.home(), f".local/share/avdc/{filename}")
-    ]
+    path_search_order = (
+        Path.cwd() / filename,
+        Path.home() / filename,
+        Path.home() / f".avdc/{filename}",
+        Path.home() / f".local/share/avdc/{filename}"
+    )
     cookies_filename = None
-    try:
-        for p in path_search_order:
-            if os.path.exists(p):
-                cookies_filename = os.path.abspath(p)
-                break
-        if not cookies_filename:
-            return None, None
+    for p in path_search_order:
+        if p.is_file():
+            cookies_filename = str(p.resolve())
+            break
+    if not cookies_filename:
+        return None, None
+    try:
         return json.load(open(cookies_filename)), cookies_filename
     except:
         return None, None
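A minimal usage sketch of the new search order (the cookies filename below is hypothetical):

# Hypothetical usage: search CWD, then ~, ~/.avdc and ~/.local/share/avdc for the file
cookies, cookies_path = load_cookies('javdb9.json')
if cookies is not None:
    print(f"Loaded cookies from '{cookies_path}'")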
@@ -623,10 +621,9 @@ def file_modification_days(filename) -> int:
         return 9999
     return days
-# Check whether the file is a link
-def is_link(filename: str):
-    if os.path.islink(filename):
-        return True  # symlink
-    elif os.stat(filename).st_nlink > 1:
-        return True  # hard link: Linux, macOS, Windows NTFS
-    return False
+def file_not_exist_or_empty(filepath) -> bool:
+    return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
+# Simple Japanese detection
+def is_japanese(s) -> bool:
+    return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
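The character classes cover hiragana (U+3040-309F), katakana (U+30A0-30FF) and half-width katakana (U+FF66-FF9F), so pure-Chinese text never reaches the translation engine. A minimal sketch of the short-circuit:

import re

def is_japanese(s) -> bool:
    return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))

print(is_japanese('素人をナンパ'))  # True: contains kana, translate() proceeds
print(is_japanese('中文字幕'))      # False: CJK ideographs only, translate() returns src unchanged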

View File

@@ -6,12 +6,13 @@ import sys
 import shutil
 import typing
 import urllib3
+import signal
 import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html, is_link
+from ADC_function import file_modification_days, get_html
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -35,30 +36,54 @@ def check_update(local_version):
 def argparse_function(ver: str) -> typing.Tuple[str, str, bool]:
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    conf = config.getInstance()
+    parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
     parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
     parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.")
-    # parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.")
-    default_logdir = os.path.join(Path.home(),'.avlogs')
+    parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
+    parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
+    # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
+    default_logdir = str(Path.home() / '.avlogs')
     parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?',
-        help=f"""Duplicate stdout and stderr to logfiles
-        in logging folder, default on.
-        default for current user: {default_logdir}
-        Use --log-dir= to turn off logging feature.""")
-    parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number")
-    parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true",
-        help="Auto exit after program complete")
+        help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
+        default folder for current user: '{default_logdir}'. Change default folder to an empty file,
+        or use --log-dir= to turn log off.""")
     parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
+    parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
+    parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
+    parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
+        os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
+    parser.add_argument("-a", "--auto-exit", action="store_true",
+        help="Auto exit after program complete")
+    parser.add_argument("-g","--debug", action="store_true",
+        help="Turn on debug mode to generate diagnostic log for issue report.")
+    parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true",
+        help="""Only show job list of files and numbers, and **NO** actual operation
+        is performed. It may help you correct wrong numbers before real job.""")
     parser.add_argument("-v", "--version", action="version", version=ver)
+    #ini_path
     args = parser.parse_args()
+    def get_natural_number_or_none(value):
+        return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None
+    def get_str_or_none(value):
+        return value if isinstance(value, str) and len(value) else None
+    def get_bool_or_none(value):
+        return True if isinstance(value, bool) and value else None
+    config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
+    config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
+    config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
+    config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
+    config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
+    config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
+    config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
-    return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr
+    return args.file, args.number, args.logdir, args.regexstr, args.zero_op
 class OutLogger(object):
     def __init__(self, logfile) -> None:
         self.term = sys.stdout
         self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.filepath = logfile
     def __del__(self):
         self.close()
     def __enter__(self):
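The CLI values land in config.G_conf_override and win over config.ini whenever config.getInstance() is queried. The config module itself is outside this diff, so the following is only a minimal sketch of the assumed pattern:

# Minimal sketch of the override pattern (assumed internals; the real config module is not in this diff)
G_conf_override = {"common:main_mode": None, "debug_mode:switch": None}

class Config:
    def __init__(self, ini_main_mode: int = 1):
        self._ini_main_mode = ini_main_mode  # value parsed from config.ini

    def main_mode(self) -> int:
        # a non-None CLI override beats the config.ini value
        override = G_conf_override.get("common:main_mode")
        return override if override is not None else self._ini_main_mode

_instance = Config()

def getInstance() -> Config:
    # process-wide singleton, so every module sees the same overrides
    return _instance

G_conf_override["common:main_mode"] = 3   # e.g. set by `-m 3` on the command line
print(getInstance().main_mode())          # 3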
@@ -85,6 +110,7 @@ class ErrLogger(OutLogger):
     def __init__(self, logfile) -> None:
         self.term = sys.stderr
         self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.filepath = logfile
     def close(self):
         if self.term != None:
             sys.stderr = self.term
@@ -97,14 +123,18 @@ class ErrLogger(OutLogger):
 def dupe_stdout_to_logfile(logdir: str):
     if not isinstance(logdir, str) or len(logdir) == 0:
         return
-    if not os.path.isdir(logdir):
-        os.makedirs(logdir)
-        if not os.path.isdir(logdir):
-            return
+    log_dir = Path(logdir)
+    if not log_dir.exists():
+        try:
+            log_dir.mkdir(parents=True,exist_ok=True)
+        except:
+            pass
+    if not log_dir.is_dir():
+        return  # Tip: disable logging by replacing the log directory with an empty regular file of the same name
+    abslog_dir = log_dir.resolve()
     log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S")
-    logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt')
-    errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt')
+    logfile = abslog_dir / f'avdc_{log_tmstr}.txt'
+    errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt'
     sys.stdout = OutLogger(logfile)
     sys.stderr = ErrLogger(errlog)
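OutLogger/ErrLogger replace sys.stdout and sys.stderr with tee-style writers. Their write() methods fall outside this hunk, so this is a minimal sketch of the assumed behaviour, with an illustrative class name and filename:

import sys

class Tee:
    # illustrative stand-in for OutLogger/ErrLogger; the real write() is outside this hunk
    def __init__(self, term, logfile):
        self.term = term
        self.log = open(logfile, "w", encoding="utf-8", buffering=1)  # line-buffered, as above
        self.filepath = logfile
    def write(self, msg):
        self.term.write(msg)  # still reach the console...
        self.log.write(msg)   # ...while duplicating into the log file
    def flush(self):
        self.term.flush()
        self.log.flush()

# hypothetical use, mirroring dupe_stdout_to_logfile():
# sys.stdout = Tee(sys.stdout, 'avdc_20211021T120000.txt')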
@@ -113,28 +143,126 @@ def dupe_stdout_to_logfile(logdir: str):
 def close_logfile(logdir: str):
     if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir):
         return
+    # save the log file path before the log is closed
+    filepath = None
+    try:
+        filepath = sys.stdout.filepath
+    except:
+        pass
     sys.stdout.close()
     sys.stderr.close()
-    # clean up empty files
-    for current_dir, subdirs, files in os.walk(logdir, topdown=False):
-        try:
-            for f in files:
-                full_name = os.path.join(current_dir, f)
-                if os.path.getsize(full_name) == 0:
-                    os.remove(full_name)
-        except:
-            pass
+    log_dir = Path(logdir).resolve()
+    if isinstance(filepath, Path):
+        print(f"Log file '{filepath}' saved.")
+        assert(filepath.parent.samefile(log_dir))
+    # clean up empty files
+    for f in log_dir.glob(r'*_err.txt'):
+        if f.stat().st_size == 0:
+            try:
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Merge logs. Only text logs directly inside the log directory are scanned; subdirectories are ignored.
+    # Logs older than three days are merged into one log per day; logs older than three months are merged
+    # into one log per month; monthly logs from last year and earlier are merged into yearly logs from April
+    # of the current year onward.
+    # Test steps:
+    """
+    LOGDIR=/tmp/avlog
+    mkdir -p $LOGDIR
+    for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done
+    for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done
+    for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done
+    echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR"
+    # 1932 files in /tmp/avlog
+    avdc -zgic1 -d0 -m3 -o $LOGDIR
+    # python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR
+    ls $LOGDIR
+    # rm -rf $LOGDIR
+    """
+    today = datetime.today()
+    # Step 1: merge into daily logs. Files older than 3 days whose names fall on the same day become one log.
+    for i in range(1):
+        txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)]
+        if not txts or not len(txts):
+            break
+        e = [f for f in txts if '_err' in f.stem]
+        txts.sort()
+        tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99")
+        deadline_day = f'avdc_{tmstr_3_days_ago}'
+        day_merge = [f for f in txts if f.stem < deadline_day]
+        if not day_merge or not len(day_merge):
+            break
+        cutday = len('T235959.txt')  # cut length avdc_20201201|T235959.txt
+        for f in day_merge:
+            try:
+                day_file_name = str(f)[:-cutday] + '.txt'  # avdc_20201201.txt
+                with open(day_file_name, 'a', encoding='utf-8') as m:
+                    m.write(f.read_text(encoding='utf-8'))
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Step 2: merge into monthly logs
+    for i in range(1):  # one-pass loop so `break` can skip to the next step, avoiding a deeply indented if block (Python has no goto)
+        txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)]
+        if not txts or not len(txts):
+            break
+        txts.sort()
+        tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32")
+        deadline_month = f'avdc_{tmstr_3_month_ago}'
+        month_merge = [f for f in txts if f.stem < deadline_month]
+        if not month_merge or not len(month_merge):
+            break
+        tomonth = len('01.txt')  # cut length avdc_202012|01.txt
+        for f in month_merge:
+            try:
+                month_file_name = str(f)[:-tomonth] + '.txt'  # avdc_202012.txt
+                with open(month_file_name, 'a', encoding='utf-8') as m:
+                    m.write(f.read_text(encoding='utf-8'))
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Step 3: merge monthly logs into yearly logs
+    if today.month < 4:
+        return
+    mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)]
+    if not mons or not len(mons):
+        return
+    mons.sort()
+    deadline_year = f'avdc_{today.year-1}13'
+    year_merge = [f for f in mons if f.stem < deadline_year]
+    if not year_merge or not len(year_merge):
+        return
+    toyear = len('12.txt')  # cut length avdc_2020|12.txt
+    for f in year_merge:
+        try:
+            year_file_name = str(f)[:-toyear] + '.txt'  # avdc_2020.txt
+            with open(year_file_name, 'a', encoding='utf-8') as y:
+                y.write(f.read_text(encoding='utf-8'))
+            f.unlink(missing_ok=True)
+        except:
+            pass
+    # Step 4: compressing yearly logs is left to the user: compress by hand or schedule an external script.
+    # nongnu's lzip is recommended; for text logs at this granularity it currently has the best compression
+    # ratio. With lzip -9 the ratio beats xz -9 while using less memory, with better multi-core utilisation
+    # (plzip, the multi-threaded build) and faster decompression. Compressed size is roughly 2.4% to 3.7% of
+    # the original; a 100MB log file shrinks to about 3.7MB.
-# Rewritten video file scan: recursion removed, global variables removed, failed-file-list skipping added
-def movie_lists(root, conf, regexstr):
-    escape_folder = re.split("[,]", conf.escape_folder())
+def signal_handler(*args):
+    print('[!]Ctrl+C detected, Exit.')
+    sys.exit(9)
+def sigdebug_handler(*args):
+    config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
+    print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'oFF'))
+# Added: skip files on the failed list, skip by .nfo modification days, report the total of skipped videos,
+# list every skipped file in debug mode (-g), and skip small ad clips
+def movie_lists(source_folder, regexstr):
+    conf = config.getInstance()
     main_mode = conf.main_mode()
     debug = conf.debug()
     nfo_skip_days = conf.nfo_skip_days()
     soft_link = conf.soft_link()
-    total = []
-    file_type = conf.media_type().upper().split(",")
+    file_type = conf.media_type().lower().split(",")
     trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
     cliRE = None
     if isinstance(regexstr, str) and len(regexstr):
@@ -142,72 +270,94 @@ def movie_lists(root, conf, regexstr):
             cliRE = re.compile(regexstr, re.IGNORECASE)
         except:
             pass
+    failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
     failed_set = set()
-    if main_mode == 3 or soft_link:
+    if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
         try:
-            with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt:
-                flist = flt.read().splitlines()
-                failed_set = set(flist)
-                flt.close()
-            if len(flist) != len(failed_set):
-                with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt:
-                    flt.writelines([line + '\n' for line in failed_set])
-                    flt.close()
+            flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
+            failed_set = set(flist)
+            if len(flist) != len(failed_set):  # de-duplicate and write back without reordering failed_list.txt entries; keep only the last occurrence of duplicates
+                fset = failed_set.copy()
+                for i in range(len(flist)-1, -1, -1):
+                    fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
+                failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8')
+                assert len(fset) == 0 and len(flist) == len(failed_set)
         except:
             pass
-    for current_dir, subdirs, files in os.walk(root, topdown=False):
-        if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0:
+    if not Path(source_folder).is_dir():
+        print('[-]Source folder not found!')
+        return []
+    total = []
+    source = Path(source_folder).resolve()
+    skip_failed_cnt, skip_nfo_days_cnt = 0, 0
+    escape_folder_set = set(re.split("[,]", conf.escape_folder()))
+    for full_name in source.glob(r'**/*'):
+        if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
             continue
-        for f in files:
-            full_name = os.path.join(current_dir, f)
-            if not os.path.splitext(full_name)[1].upper() in file_type:
-                continue
-            absf = os.path.abspath(full_name)
-            if absf in failed_set:
-                if debug:
-                    print('[!]Skip failed file:', absf)
-                continue
-            if cliRE and not cliRE.search(absf):
-                continue
-            if main_mode == 3 and nfo_skip_days > 0:
-                nfo = Path(absf).with_suffix('.nfo')
-                if file_modification_days(nfo) <= nfo_skip_days:
-                    continue
-            if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
-                total.append(absf)
+        if not full_name.suffix.lower() in file_type:
+            continue
+        absf = str(full_name)
+        if absf in failed_set:
+            skip_failed_cnt += 1
+            if debug:
+                print('[!]Skip failed movie:', absf)
+            continue
+        is_sym = full_name.is_symlink()
+        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1):  # short-circuit evaluation; don't stat() a symlink, its target may not exist
+            continue  # file is symlink or hardlink (Linux/NTFS/Darwin)
+        # 0-byte samples are allowed through for debugging; anything else under 120MB is dropped to filter small ads
+        # such as '苍老师强力推荐.mp4' (102.2MB), '黑道总裁.mp4' (98.4MB), '有趣的妹子激情表演.MP4' (95MB), '有趣的臺灣妹妹直播.mp4' (15.1MB)
+        movie_size = 0 if is_sym else full_name.stat().st_size  # as above, don't stat() symlinks for st_size; 0 also skips the small-video check
+        if movie_size > 0 and movie_size < 125829120:  # 1024*1024*120 = 125829120
+            continue
+        if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
+            continue
+        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+            skip_nfo_days_cnt += 1
+            if debug:
+                print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
+            continue
+        total.append(absf)
+    if skip_failed_cnt:
+        print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
+    if skip_nfo_days_cnt:
+        print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
     if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
         return total
     # In soft-link mode, titles already scraped also need their .nfo age checked in the success folder; skip those updated within N days
     skip_numbers = set()
-    success_folder = conf.success_folder()
-    for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
-        for f in files:
-            f_obj = Path(f)
-            if f_obj.suffix.lower() != '.nfo':
-                continue
-            if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
-                continue
-            number = get_number(False, f_obj.stem)
-            if number:
-                skip_numbers.add(number.upper())
+    success_folder = Path(conf.success_folder()).resolve()
+    for f in success_folder.glob(r'**/*'):
+        if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+            continue
+        if file_modification_days(f) > nfo_skip_days:
+            continue
+        number = get_number(False, f.stem)
+        if not number:
+            continue
+        skip_numbers.add(number.lower())
     rm_list = []
     for f in total:
         n_number = get_number(False, os.path.basename(f))
-        if n_number and n_number.upper() in skip_numbers:
+        if n_number and n_number.lower() in skip_numbers:
             rm_list.append(f)
     for f in rm_list:
         total.remove(f)
+        if debug:
+            print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
+    if len(rm_list):
+        print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.")
     return total
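The failed_list.txt de-duplication above keeps the last occurrence of each entry without reordering the rest. A standalone sketch of the same loop:

# Minimal sketch of the keep-last, order-preserving de-dup used on failed_list.txt above
flist = ['a.mp4', 'b.mp4', 'a.mp4', 'c.mp4', 'b.mp4']
fset = set(flist)
for i in range(len(flist)-1, -1, -1):  # walk backwards so the last occurrence wins
    fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
print(flist)  # ['a.mp4', 'c.mp4', 'b.mp4']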
 def create_failed_folder(failed_folder):
-    if not os.path.isdir(failed_folder):  # create the 'failed' folder
+    if not os.path.exists(failed_folder):  # create the 'failed' folder
         try:
             os.makedirs(failed_folder)
+            if not os.path.isdir(failed_folder):
+                raise
         except:
-            print("[-]failed! Can not make folder 'failed'\n[-](Please run as Administrator)")
+            print(f"[-]Fatal error! Can not make folder '{failed_folder}'")
             sys.exit(0)
@@ -227,24 +377,29 @@ def rm_empty_folder(path):
         pass
-def create_data_and_move(file_path: str, c: config.Config, debug):
+def create_data_and_move(file_path: str, zero_op):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
-    file_name = os.path.basename(file_path)
-    n_number = get_number(debug, file_name)
+    debug = config.getInstance().debug()
+    n_number = get_number(debug, os.path.basename(file_path))
     file_path = os.path.abspath(file_path)
     if debug == True:
-        print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+        print(f"[!] [{n_number}] As Number making data for '{file_path}'")
+        if zero_op:
+            return
         if n_number:
-            core_main(file_path, n_number, c)
+            core_main(file_path, n_number)
         else:
             print("[-] number empty ERROR")
+            moveFailedFolder(file_path)
         print("[*]======================================================")
     else:
         try:
-            print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+            print(f"[!] [{n_number}] As Number making data for '{file_path}'")
+            if zero_op:
+                return
             if n_number:
-                core_main(file_path, n_number, c)
+                core_main(file_path, n_number)
             else:
                 raise ValueError("number empty")
             print("[*]======================================================")
@@ -253,22 +408,26 @@ def create_data_and_move(file_path: str, c: config.Config, debug):
             print('[-]', err)
             try:
-                moveFailedFolder(file_path, conf)
+                moveFailedFolder(file_path)
             except Exception as err:
                 print('[!]', err)
-def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number):
+def create_data_and_move_with_custom_number(file_path: str, custom_number):
+    conf = config.getInstance()
     file_name = os.path.basename(file_path)
     try:
-        print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number))
-        core_main(file_path, custom_number, c)
+        print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number))
+        if custom_number:
+            core_main(file_path, custom_number)
+        else:
+            print("[-] number empty ERROR")
         print("[*]======================================================")
     except Exception as err:
         print("[-] [{}] ERROR:".format(file_path))
         print('[-]', err)
-        if c.soft_link():
+        if conf.soft_link():
             print("[-]Link {} to failed folder".format(file_path))
             os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
         else:
@@ -279,12 +438,26 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
         print('[!]', err)
-if __name__ == '__main__':
+def main():
     version = '5.0.1'
     urllib3.disable_warnings()  # Ignore http proxy warning
-    # Parse command line args
-    single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version)
+    # Read config.ini first; argparse_function() needs conf.failed_folder()
+    conf = config.Config("config.ini")
+    # Parse command line args
+    single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
+    main_mode = conf.main_mode()
+    if not main_mode in (1, 2, 3):
+        print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
+        sys.exit(4)
+    signal.signal(signal.SIGINT, signal_handler)
+    if sys.platform == 'win32':
+        signal.signal(signal.SIGBREAK, sigdebug_handler)
+    else:
+        signal.signal(signal.SIGWINCH, sigdebug_handler)
     dupe_stdout_to_logfile(logdir)
     print('[*]================== AV Data Capture ===================')
@@ -293,55 +466,62 @@ if __name__ == '__main__':
print('[*]======================================================') print('[*]======================================================')
print('[*]严禁在墙内宣传本项目') print('[*]严禁在墙内宣传本项目')
# Read config.ini start_time = time.time()
conf = config.Config("config.ini") print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
if conf.update_check(): if conf.update_check():
check_update(version) check_update(version)
print(f"[+]Load Config file '{conf.ini_path}'.")
if conf.debug(): if conf.debug():
print('[+]Enable debug') print('[+]Enable debug')
if conf.soft_link(): if conf.soft_link():
print('[!]Enable soft link') print('[!]Enable soft link')
#print('[!]CmdLine:'," ".join(sys.argv[1:])) if len(sys.argv)>1:
print('[!]CmdLine:'," ".join(sys.argv[1:]))
print('[+]Main Working mode ## {}: {} ## {}{}{}'
.format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1],
"" if not conf.multi_threading() else ", multi_threading on",
"" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
"" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
) if not single_file_path else ('-','Single File', '','',''))
)
create_failed_folder(conf.failed_folder()) create_failed_folder(conf.failed_folder())
start_time = time.time()
if not single_file_path == '': #Single File if not single_file_path == '': #Single File
print('[+]==================== Single File =====================') print('[+]==================== Single File =====================')
if custom_number == '': if custom_number == '':
create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path))) create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
else: else:
create_data_and_move_with_custom_number(single_file_path, conf, custom_number) create_data_and_move_with_custom_number(single_file_path, custom_number)
else: else:
if folder_path == '': folder_path = conf.source_folder()
if not isinstance(folder_path, str) or folder_path == '':
folder_path = os.path.abspath(".") folder_path = os.path.abspath(".")
movie_list = movie_lists(folder_path, conf, regexstr) movie_list = movie_lists(folder_path, regexstr)
count = 0 count = 0
count_all = str(len(movie_list)) count_all = str(len(movie_list))
print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S")) print('[+]Find', count_all, 'movies.')
main_mode = conf.main_mode() print('[*]======================================================')
stop_count = conf.stop_counter() stop_count = conf.stop_counter()
if stop_count<1: if stop_count<1:
stop_count = 999999 stop_count = 999999
else: else:
count_all = str(min(len(movie_list), stop_count)) count_all = str(min(len(movie_list), stop_count))
if main_mode == 3:
print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。')
for movie_path in movie_list: # 遍历电影列表 交给core处理 for movie_path in movie_list: # 遍历电影列表 交给core处理
count = count + 1 count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%' percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -') print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
create_data_and_move(movie_path, conf, conf.debug()) create_data_and_move(movie_path, zero_op)
if count >= stop_count: if count >= stop_count:
print("[!]Stop counter triggered!") print("[!]Stop counter triggered!")
break break
if conf.del_empty_folder(): if conf.del_empty_folder() and not zero_op:
rm_empty_folder(conf.success_folder()) rm_empty_folder(conf.success_folder())
rm_empty_folder(conf.failed_folder()) rm_empty_folder(conf.failed_folder())
if len(folder_path): if len(folder_path):
@@ -353,9 +533,15 @@ if __name__ == '__main__':
               " End at", time.strftime("%Y-%m-%d %H:%M:%S"))
     print("[+]All finished!!!")
-    if not (conf.auto_exit() or auto_exit):
-        input("Press enter key exit, you can check the error message before you exit...")
     close_logfile(logdir)
+    if not conf.auto_exit():
+        input("Press enter key exit, you can check the error message before you exit...")
     sys.exit(0)
+import multiprocessing
+if __name__ == '__main__':
+    multiprocessing.freeze_support()
+    main()

View File

@@ -16,7 +16,9 @@ make:
 	#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
 	@echo "[+]Pyinstaller make"
-	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img"
+	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+		--add-data "Img:Img" \
+		--add-data "config.ini:." \
 	@echo "[+]Move to bin"
 	if [ ! -d "./bin" ];then mkdir bin; fi

View File

@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool:  # metadata fetch failure detection
     return True
-def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
+def get_data_from_json(file_number):  # return metadata from JSON
     """
     iterate through all services and fetch the data
     """
@@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
         "fc2club": fc2club.main
     }
+    conf = config.getInstance()
     # default fetch order list, from the beginning to the end
     sources = conf.sources().split(',')
     if not len(conf.sources()) > 80:
@@ -114,6 +115,7 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
             json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
             # if any service returns a valid result, break
             if get_data_state(json_data):
+                print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
                 break
         pool.close()
         pool.terminate()
@@ -125,6 +127,7 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
                 json_data = json.loads(func_mapping[source](file_number))
                 # if any service returns a valid result, break
                 if get_data_state(json_data):
+                    print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
                     break
             except:
                 break
@@ -134,6 +137,14 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
         print('[-]Movie Number not found!')
         return None
+    # Strict number check: guards against faulty sources that return data for a different number no matter
+    # what was queried (e.g. always returning "本橋実来 ADZ335"). The current number naming rules follow
+    # javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z); other rule sets are worth watching too, e.g.
+    # airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z). If javdb.com's rules ever produce same-name
+    # collisions between studios, consider switching rules and updating the number parsing and scraping code.
+    if str(json_data.get('number')).upper() != file_number.upper():
+        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
+        return None
     # ================================================ end of website rules ================================================
     title = json_data.get('title')
@@ -167,6 +178,10 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
     imagecut = json_data.get('imagecut')
     tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # string to list @
+    while 'XXXX' in tag:
+        tag.remove('XXXX')
+    while 'xxx' in tag:
+        tag.remove('xxx')
     actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
     if title == '' or number == '':
@@ -225,6 +240,8 @@ def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
     studio = studio.replace('エムズビデオグループ','Ms Video Group')
     studio = studio.replace('ミニマム','Minimum')
     studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
+    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
+    studio = studio.replace('パコパコママ','pacopacomama')
     studio = re.sub('.*/妄想族','妄想族',studio)
     studio = studio.replace('/',' ')
     # === end of studio katakana replacement ===
@@ -293,4 +310,7 @@ def special_characters_replacement(text) -> str:
         replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
         replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
         replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
-        replace('|', 'ǀ'))  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+        replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+        replace('&lsquo;', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
+        replace('&rsquo;', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
+        replace('&amp;', ''))

View File

@@ -6,6 +6,7 @@ from lxml import etree  # need install
 from bs4 import BeautifulSoup  # need install
 import json
 from ADC_function import *
+from WebCrawler import javbus
 '''
 API
@@ -17,95 +18,94 @@ API
 host = 'https://www.airav.wiki'
 # airav has no actor photos, so use the javbus images directly
-def getActorPhoto(htmlcode):  #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'star-name'})
-    d={}
-    for i in a:
-        l=i.a['href']
-        t=i.get_text()
-        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=urljoin("https://www.javbus.com",
-            str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
-        p2={t:p}
-        d.update(p2)
-    return d
+def getActorPhoto(javbus_json):
+    result = javbus_json.get('actor_photo')
+    if isinstance(result, dict) and len(result):
+        return result
+    return ''
 def getTitle(htmlcode):  # get the title
-    doc = pq(htmlcode)
-    # h5:first-child selects the first h5 tag; took forever to find this syntax
-    title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
-    try:
-        title2 = re.sub('n\d+-','',title)
-        return title2
-    except:
-        return title
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
+    return result
-def getStudio(htmlcode):  # get the studio (modified)
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    # if the record has no director, the studio sits in 4th position
-    if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    # if the record has a director, the studio sits in 5th position
-    elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
+def getStudio(htmlcode, javbus_json):  # get the studio (modified)
+    # prefer javbus data when it exists
+    result = javbus_json.get('studio')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode,etree.HTMLParser())
+    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
-def getYear(htmlcode):  # get the year
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
+def getYear(htmlcode, javbus_json):  # get the year
+    result = javbus_json.get('year')
+    if isinstance(result, str) and len(result):
+        return result
+    release = getRelease(htmlcode, javbus_json)
+    if len(release) != len('2000-01-01'):
+        return ''
+    return release[:4]
-def getCover(htmlcode):  # get the cover link
-    doc = pq(htmlcode)
-    image = doc('a.bigImage')
-    return urljoin("https://www.javbus.com", image.attr('href'))
+def getCover(htmlcode, javbus_json):  # get the cover image
+    result = javbus_json.get('cover')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
-def getRelease(htmlcode):  # get the release date
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
+def getRelease(htmlcode, javbus_json):  # get the release date
+    result = javbus_json.get('release')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    try:
+        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
+    except:
+        return ''
+    return result
-def getRuntime(htmlcode):  # get the runtime in minutes (modified)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
-    return result
+def getRuntime(javbus_json):  # get the runtime
+    result = javbus_json.get('runtime')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+# airav's actress database mostly has kanji names while javbus mostly has kana, so airav takes priority
-def getActor(htmlcode):  # get the actresses
-    b=[]
-    soup=BeautifulSoup(htmlcode,'lxml')
-    a=soup.find_all(attrs={'class':'star-name'})
-    for i in a:
-        b.append(i.get_text())
-    return b
+def getActor(htmlcode, javbus_json):  # get the actresses
+    b=[]
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
+    for v in a:
+        v = v.strip()
+        if len(v):
+            b.append(v)
+    if len(b):
+        return b
+    result = javbus_json.get('actor')
+    if isinstance(result, list) and len(result):
+        return result
+    return []
-def getNum(htmlcode):  # get the number
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
+def getNum(htmlcode, javbus_json):  # get the number
+    result = javbus_json.get('number')
+    if isinstance(result, str) and len(result):
+        return result
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('^\[(.*?)]', title)[0])
+    return result
-def getDirector(htmlcode):  # get the director (modified)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    else:
-        result = ''  # the record may have no director data
-    return result
+def getDirector(javbus_json):  # get the director (modified)
+    result = javbus_json.get('director')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
-def getOutline(htmlcode):  # get the actors
+def getOutline(htmlcode):  # get the synopsis
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     try:
-        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
+        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
         return result
     except:
         return ''
-def getSerise(htmlcode):  # get the series (modified)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    # if the record has no director, the series sits in 6th position
-    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
-    # if the record has a director, the series sits in 7th position
-    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
+def getSerise(javbus_json):  # get the series (modified)
+    result = javbus_json.get('series')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
 def getTag(htmlcode):  # get the tags
     tag = []
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
     try:
         try:
             htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
+            javbus_json = json.loads(javbus.main(number))
         except:
             print(number)
         dic = {
             # the title can come from airav
-            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
+            'title': getTitle(htmlcode),
-            # studio uses javbus
+            # studio: try javbus first, fall back to this site
-            'studio': getStudio(javbus_htmlcode),
+            'studio': getStudio(htmlcode, javbus_json),
-            # year also uses javbus
+            # year: try javbus first, fall back to this site
-            'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
+            'year': getYear(htmlcode, javbus_json),
             # synopsis uses airav
             'outline': getOutline(htmlcode),
             # uses javbus
-            'runtime': getRuntime(javbus_htmlcode),
+            'runtime': getRuntime(javbus_json),
             # director uses javbus
-            'director': getDirector(javbus_htmlcode),
+            'director': getDirector(javbus_json),
-            # actresses use airav
+            # actresses: try airav first
-            'actor': getActor(javbus_htmlcode),
+            'actor': getActor(htmlcode, javbus_json),
-            # release date uses javbus
+            # release date: try javbus first
-            'release': getRelease(javbus_htmlcode),
+            'release': getRelease(htmlcode, javbus_json),
             # number uses javbus
-            'number': getNum(javbus_htmlcode),
+            'number': getNum(htmlcode, javbus_json),
             # cover link uses javbus
-            'cover': getCover(javbus_htmlcode),
+            'cover': getCover(htmlcode, javbus_json),
             # stills
             'extrafanart': getExtrafanart(htmlcode),
             'imagecut': 1,
             # uses airav
             'tag': getTag(htmlcode),
             # uses javbus
-            'label': getSerise(javbus_htmlcode),
+            'label': getSerise(javbus_json),
             # damn, airav provides no actor photos
-            'actor_photo': getActorPhoto(javbus_htmlcode),
+            # 'actor_photo': getActorPhoto(javbus_json),
             'website': 'https://www.airav.wiki/video/' + number,
             'source': 'airav.py',
             # uses javbus
-            'series': getSerise(javbus_htmlcode),
+            'series': getSerise(javbus_json)
         }
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )  # .encode('UTF-8')
         return js
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
             print(e)
         data = {
             "title": "",
@@ -226,6 +224,6 @@ def main(number):
 if __name__ == '__main__':
-    #print(main('ADN-188'))
-    print(main('ADN-188'))
-    print(main('CJOD-278'))
+    print(main('ADV-R0624'))  # the javbus page returns 404, airav has data
+    print(main('ADN-188'))    # one actress
+    print(main('CJOD-278'))   # multiple actresses; javbus uses kana for actress names, airav uses kanji
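All of the rewritten getters share the same javbus-first shape. A minimal standalone sketch of the pattern (the field name and fallback value are illustrative):

# Minimal sketch of the javbus-first fallback used by the getters above
def get_field(javbus_json: dict, key: str, fallback_from_html):
    result = javbus_json.get(key)
    if isinstance(result, str) and len(result):
        return result            # trust javbus when it has data
    return fallback_from_html()  # otherwise parse the airav page

print(get_field({'studio': 'S1'}, 'studio', lambda: 'parsed-from-airav'))  # S1
print(get_field({}, 'studio', lambda: 'parsed-from-airav'))                # parsed-from-airav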

View File

@@ -3,50 +3,42 @@ sys.path.append('..')
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 from ADC_function import *
-# import sys
+from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-def getActorPhoto(htmlcode):  #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActorPhoto(html):
+    a = html.xpath('//a[@class="avatar-box"]')
     d = {}
     for i in a:
-        l = i.img['src']
-        t = i.span.get_text()
+        l = i.find('.//img').attrib['src']
+        t = i.find('span').text
         p2 = {t: l}
         d.update(p2)
     return d
-def getTitle(a):
+def getTitle(html):
     try:
-        html = etree.fromstring(a, etree.HTMLParser())
         result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  #[0]
         return result.replace('/', '')
     except:
         return ''
-def getActor(a):  #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActor(html):
+    a = html.xpath('//a[@class="avatar-box"]')
     d = []
     for i in a:
-        d.append(i.span.get_text())
+        d.append(i.find('span').text)
     return d
-def getStudio(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getStudio(html):
     result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
     return result1
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getRuntime(html):
     result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
     return result1
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getLabel(html):
     result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
     return result1
-def getNum(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getNum(html):
     result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
     return result1
 def getYear(release):
@@ -55,28 +47,20 @@ def getYear(release):
         return result
     except:
         return release
-def getRelease(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getRelease(html):
     result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
     return result1
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover(html):
     result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
     return result
-def getCover_small(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover_small(html):
     result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
     return result
-def getTag(a):  # get the actors
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'genre'})
-    d = []
-    for i in a:
-        d.append(i.get_text())
-    return d
-def getSeries(htmlcode):
+def getTag(html):
+    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+    return x[2:] if len(x) > 2 else []
+def getSeries(html):
     try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
         result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
         return result1
     except:
@@ -86,42 +70,45 @@ def main(number):
html = get_html('https://tellme.pw/avsox') html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0] site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
a = get_html(site + '/cn/search/' + number) a = get_html(site + '/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None': if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_')) a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None': if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', '')) a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
web = get_html("https:" + result1) detail = get_html("https:" + result1)
soup = BeautifulSoup(web, 'lxml') lx = etree.fromstring(detail, etree.HTMLParser())
info = str(soup.find(attrs={'class': 'row movie'}))
try: try:
new_number = getNum(lx)
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = getTitle(lx).replace(new_number, '').strip()
dic = { dic = {
'actor': getActor(web), 'actor': getActor(lx),
'title': getTitle(web).strip(getNum(web)), 'title': title,
'studio': getStudio(info), 'studio': getStudio(lx),
'outline': '', # 'outline': getStoryline(number, title),
'runtime': getRuntime(info), 'runtime': getRuntime(lx),
'director': '', # 'director': '', #
'release': getRelease(info), 'release': getRelease(lx),
'number': getNum(info), 'number': new_number,
'cover': getCover(web), 'cover': getCover(lx),
'cover_small': getCover_small(a), 'cover_small': getCover_small(html),
'imagecut': 3, 'imagecut': 3,
'tag': getTag(web), 'tag': getTag(lx),
'label': getLabel(info), 'label': getLabel(lx),
'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(getRelease(lx)),
'actor_photo': getActorPhoto(web), 'actor_photo': getActorPhoto(lx),
'website': "https:" + result1, 'website': "https:" + result1,
'source': 'avsox.py', 'source': 'avsox.py',
'series': getSeries(info), 'series': getSeries(lx),
} }
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -129,3 +116,4 @@ def main(number):
if __name__ == "__main__": if __name__ == "__main__":
print(main('012717_472')) print(main('012717_472'))
print(main('1')) # used to return a fake result; now raises 'number not found'
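A note on the idiom above: the rewritten getTag takes everything from the page's keywords meta tag in a single xpath call and slices off the leading entries (apparently the number and title). A minimal offline sketch, with invented sample HTML:

from lxml import etree

sample = '<html><head><meta name="keywords" content="012717_472,Some Title,TagA,TagB"></head></html>'
lx = etree.fromstring(sample, etree.HTMLParser())
kwds = lx.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
tags = kwds[2:] if len(kwds) > 2 else []  # the first two slots are not tags
print(tags)  # ['TagA', 'TagB']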

@@ -1,34 +1,32 @@
import sys import sys
sys.path.append('../') sys.path.append('../')
import json import json
from bs4 import BeautifulSoup
from lxml import html from lxml import html
import re import re
from ADC_function import * from ADC_function import *
from WebCrawler.storyline import getStoryline
def main(number: str) -> json: def main(number: str) -> json:
try: try:
caribbytes, browser = get_html_by_browser( # The actor photo feature is not used yet; to speed things up, the browser version is temporarily commented out in favor of get_html()
'https://www.caribbeancom.com/moviepages/'+number+'/index.html', #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
return_type="browser") # return_type='browser')
#if not r.ok:
if not caribbytes or not caribbytes.ok: # raise ValueError("page not found")
#htmlcode = str(browser.page)
htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
htmlcode = htmlbyte.decode('euc-jp')
if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found") raise ValueError("page not found")
lx = html.fromstring(str(browser.page)) lx = html.fromstring(htmlcode)
title = get_title(lx)
if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
raise ValueError("page info not found")
except Exception as e:
if config.Config().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
dic = { dic = {
'title': get_title(lx), 'title': title,
'studio': '加勒比', 'studio': '加勒比',
'year': get_year(lx), 'year': get_year(lx),
'outline': get_outline(lx), 'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx), 'runtime': get_runtime(lx),
'director': '', 'director': '',
'actor': get_actor(lx), 'actor': get_actor(lx),
@@ -47,14 +45,25 @@ def main(number: str) -> json:
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js return js
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str: def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip() return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
def get_year(lx: html.HtmlElement) -> str: def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4] return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement) -> str: def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip() o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str: def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-') return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
@@ -114,11 +123,10 @@ def get_actor_photo(browser):
if pos<0: if pos<0:
continue continue
css = html[pos:pos+100] css = html[pos:pos+100]
p0 = css.find('background: url(') cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
p1 = css.find('.jpg)') if not cssBGjpgs or not len(cssBGjpgs[0]):
if p0<0 or p1<0:
continue continue
p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])} p = {k: urljoin(browser.url, cssBGjpgs[0])}
o.update(p) o.update(p)
return o return o
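The rewrite above replaces the paired find() calls with one regex over the inline CSS. A standalone sketch of the same extraction; the CSS snippet and base URL are invented:

import re
from urllib.parse import urljoin

css = '.itemimage { background: url(/moviepages/010121-001/images/n.jpg) no-repeat; }'
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if cssBGjpgs and len(cssBGjpgs[0]):
    print(urljoin('https://www.caribbeancom.com/moviepages/010121-001/', cssBGjpgs[0]))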

@@ -153,7 +153,7 @@ def main(number):
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js return js
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
data = { data = {
"title": "", "title": "",

@@ -93,6 +93,7 @@ def main(number):
actor = '素人' actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser()) lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']") cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
dic = { dic = {
'title': lx.xpath('/html/head/title/text()')[0], 'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2), 'studio': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
'series': '', 'series': '',
} }
except Exception as e: except Exception as e:
if ADC_function.config.Config().debug(): if ADC_function.config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):
if __name__ == '__main__': if __name__ == '__main__':
print(main('FC2-1787685')) print(main('FC2-1787685'))
print(main('FC2-2086710'))
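The new urljoin call only normalizes the cover path; assuming ADC_function re-exports urllib.parse.urljoin (it is used unqualified elsewhere in this diff), the behaviour is the standard one:

from urllib.parse import urljoin

# a relative path gets the host prefixed (both paths here are invented examples)
print(urljoin('https://adult.contents.fc2.com', '/storage/cover.jpg'))
# an already-absolute URL passes through unchanged
print(urljoin('https://adult.contents.fc2.com', 'https://cdn.example.com/cover.jpg'))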

@@ -103,7 +103,7 @@ def main(number):
'series': '', 'series': '',
} }
except Exception as e: except Exception as e:
if ADC_function.config.Config().debug(): if ADC_function.config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')

@@ -1,114 +1,76 @@
import sys import sys
sys.path.append('../') sys.path.append('../')
import re import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json import json
from ADC_function import * from ADC_function import *
from WebCrawler import fanza from WebCrawler.storyline import getStoryline
from WebCrawler import airav
def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img def getActorPhoto(html):
soup = BeautifulSoup(htmlcode, 'lxml') actors = html.xpath('//div[@class="star-name"]/a')
a = soup.find_all(attrs={'class': 'star-name'})
d={} d={}
for i in a: for i in actors:
l=i.a['href'] url=i.attrib['href']
t=i.get_text() t=i.attrib['title']
html = etree.fromstring(get_html(l), etree.HTMLParser()) html = etree.fromstring(get_html(url), etree.HTMLParser())
p=urljoin("https://www.javbus.com", p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']")) str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p} p2={t:p}
d.update(p2) d.update(p2)
return d return d
def getTitle(htmlcode): # get the title def getTitle(html): # get the title
doc = pq(htmlcode) title = str(html.xpath('/html/head/title/text()')[0])
title=str(doc('div.container h3').text()).replace(' ','-') title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
try:
title2 = re.sub('n\d+-','',title)
return title2
except:
return title return title
def getStudio(htmlcode): # get the studio (modified) def getStudioJa(html):
html = etree.fromstring(htmlcode,etree.HTMLParser()) x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
# if the record has no director, the studio sits in the 4th field return str(x[0]) if len(x) else ''
if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): def getStudio(html): # get the studio
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
# if the record has a director, the studio sits in the 5th field return str(x[0]) if len(x) else ''
elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): def getYear(html): # get the year
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
else: return result[:4] if len(result)>=len('2000-01-01') else ''
result = '' def getCover(html): # get the cover URL
return result image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
def getYear(htmlcode): # get the year def getRelease(html): # get the release date
html = etree.fromstring(htmlcode,etree.HTMLParser()) def getRelease(html): #获取出版日期
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result return result
def getCover(htmlcode): # get the cover URL def getRuntime(html): # get the runtime in minutes (modified)
doc = pq(htmlcode)
image = doc('a.bigImage')
return urljoin("https://www.javbus.com", image.attr('href'))
def getRelease(htmlcode): # get the release date
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getRuntime(htmlcode): # get the runtime in minutes (modified)
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result return result
def getActor(htmlcode): # get the actresses def getActor(html): # get the actresses
b=[] b=[]
soup=BeautifulSoup(htmlcode,'lxml') actors = html.xpath('//div[@class="star-name"]/a')
a=soup.find_all(attrs={'class':'star-name'}) for i in actors:
for i in a: b.append(i.attrib['title'])
b.append(i.get_text())
return b return b
def getNum(htmlcode): # get the ID number def getNum(html): # get the ID number
html = etree.fromstring(htmlcode, etree.HTMLParser()) kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") return kwdlist[0]
return result def getDirectorJa(html):
def getDirector(htmlcode): # get the director (modified) x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
html = etree.fromstring(htmlcode, etree.HTMLParser()) return str(x[0]) if len(x) else ''
if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): def getDirector(html): #获取导演
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
else: return str(x[0]) if len(x) else ''
result = '' # the record may have no director data def getDirector(html): # get the director
return result
def getCID(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
#print(htmlcode)
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string) result = re.sub('/.*?.jpg','',string)
return result return result
def getOutline(number): # get the storyline def getOutline(number, title): # get the storyline, querying several sites concurrently
try: return getStoryline(number,title)
response = json.loads(airav.main(number)) def getSeriseJa(html):
result = response['outline'] x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return result return str(x[0]) if len(x) else ''
except: def getSerise(html): #获取系列
return '' x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
def getSerise(htmlcode): # get the series (modified) return str(x[0]) if len(x) else ''
html = etree.fromstring(htmlcode, etree.HTMLParser()) def getTag(html): # get the tags
# if the record has no director, the series sits in the 6th field klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): taglist = [translateTag_to_sc(v) for v in klist[1:]]
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") return taglist
# if the record has a director, the series sits in the 7th field
elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
else:
result = ''
return result
def getTag(htmlcode): # get the tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'genre'})
for i in a:
if 'onmouseout' in str(i) or '多選提交' in str(i):
continue
tag.append(translateTag_to_sc(i.get_text()))
return tag
def getExtrafanart(htmlcode): # get the still shots def getExtrafanart(htmlcode): # get the still shots
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>') html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(htmlcode) html = html_pather.search(htmlcode)
@@ -117,32 +79,34 @@ def getExtrafanart(htmlcode): # 获取剧照
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"') extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html) extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs: if extrafanart_imgs:
return extrafanart_imgs return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return '' return ''
def main_uncensored(number): def main_uncensored(number):
htmlcode = get_html('https://www.javbus.com/ja/' + number) htmlcode = get_html('https://www.javbus.com/ja/' + number)
if getTitle(htmlcode) == '': if "<title>404 Page Not Found" in htmlcode:
htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) raise Exception('404 page not found')
lx = etree.fromstring(htmlcode, etree.HTMLParser())
title = getTitle(lx)
dic = { dic = {
'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), 'title': title,
'studio': getStudio(htmlcode), 'studio': getStudioJa(lx),
'year': getYear(htmlcode), 'year': getYear(lx),
'outline': getOutline(number), 'outline': getOutline(number, title),
'runtime': getRuntime(htmlcode), 'runtime': getRuntime(lx),
'director': getDirector(htmlcode), 'director': getDirectorJa(lx),
'actor': getActor(htmlcode), 'actor': getActor(lx),
'release': getRelease(htmlcode), 'release': getRelease(lx),
'number': getNum(htmlcode), 'number': getNum(lx),
'cover': getCover(htmlcode), 'cover': getCover(lx),
'tag': getTag(htmlcode), 'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode), 'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(htmlcode), 'label': getSeriseJa(lx),
'imagecut': 0, 'imagecut': 0,
'actor_photo': '', # 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number, 'website': 'https://www.javbus.com/ja/' + number,
'source': 'javbus.py', 'source': 'javbus.py',
'series': getSerise(htmlcode), 'series': getSeriseJa(lx),
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js return js
@@ -155,32 +119,36 @@ def main(number):
htmlcode = get_html('https://www.fanbus.us/' + number) htmlcode = get_html('https://www.fanbus.us/' + number)
except: except:
htmlcode = get_html('https://www.javbus.com/' + number) htmlcode = get_html('https://www.javbus.com/' + number)
if "<title>404 Page Not Found" in htmlcode:
raise Exception('404 page not found')
lx = etree.fromstring(htmlcode,etree.HTMLParser())
title = getTitle(lx)
dic = { dic = {
'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), 'title': title,
'studio': getStudio(htmlcode), 'studio': getStudio(lx),
'year': str(re.search('\d{4}', getYear(htmlcode)).group()), 'year': getYear(lx),
'outline': getOutline(number), 'outline': getOutline(number, title),
'runtime': getRuntime(htmlcode), 'runtime': getRuntime(lx),
'director': getDirector(htmlcode), 'director': getDirector(lx),
'actor': getActor(htmlcode), 'actor': getActor(lx),
'release': getRelease(htmlcode), 'release': getRelease(lx),
'number': getNum(htmlcode), 'number': getNum(lx),
'cover': getCover(htmlcode), 'cover': getCover(lx),
'imagecut': 1, 'imagecut': 1,
'tag': getTag(htmlcode), 'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode), 'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(htmlcode), 'label': getSerise(lx),
'actor_photo': getActorPhoto(htmlcode), # 'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.com/' + number, 'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py', 'source': 'javbus.py',
'series': getSerise(htmlcode), 'series': getSerise(lx),
} }
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js return js
except: except:
return main_uncensored(number) return main_uncensored(number)
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
data = { data = {
"title": "", "title": "",
@@ -191,5 +159,13 @@ def main(number):
return js return js
if __name__ == "__main__" : if __name__ == "__main__" :
config.G_conf_override['debug_mode:switch'] = True
print(main('ABP-888'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292')) print(main('ipx-292'))
print(main('CEMD-011')) print(main('CEMD-011'))
print(main('CJOD-278'))
print(main('100221_001'))
print(main('AVSW-061'))
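Most of the rewritten javbus getters share one shape: locate the span whose text is the field label, step up to its parent, and take the first <a> text, falling back to '' when the field is absent. A hedged sketch of that pattern; the helper name and sample HTML are invented:

from lxml import etree

def label_text(html, label):
    # '<span>label</span>' sits beside the value link inside the same <p>
    x = html.xpath(f'//span[contains(text(),"{label}")]/../a/text()')
    return str(x[0]) if len(x) else ''

page = '<html><body><p><span>製作商:</span> <a href="/studio/1">ExampleStudio</a></p></body></html>'
lx = etree.fromstring(page, etree.HTMLParser())
print(label_text(lx, '製作商:'))  # -> ExampleStudio
print(label_text(lx, '導演:'))    # -> '' when the field is missing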

@@ -3,25 +3,22 @@ sys.path.append('../')
import re import re
from lxml import etree from lxml import etree
import json import json
from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
from WebCrawler import airav from mechanicalsoup.stateful_browser import StatefulBrowser
# import sys from WebCrawler.storyline import getStoryline
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a): def getTitle(html):
html = etree.fromstring(a, etree.HTMLParser())
browser_title = str(html.xpath("/html/head/title/text()")[0]) browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip() return browser_title[:browser_title.find(' | JavDB')].strip()
def getActor(a): def getActor(html):
html = etree.fromstring(a, etree.HTMLParser())
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = [] r = []
idx = 0 idx = 0
actor_gendor = config.Config().actor_gender() actor_gendor = config.getInstance().actor_gender()
if not actor_gendor in ['female','male','both','all']: if not actor_gendor in ['female','male','both','all']:
actor_gendor = 'female' actor_gendor = 'female'
for act in actors: for act in actors:
@@ -33,8 +30,8 @@ def getActor(a):
idx = idx + 1 idx = idx + 1
return r return r
def getaphoto(url): def getaphoto(url, browser):
html_page = get_html(url) html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
img_url = img_prether.findall(html_page) img_url = img_prether.findall(html_page)
if img_url: if img_url:
@@ -42,24 +39,18 @@ def getaphoto(url):
else: else:
return '' return ''
def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img
actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>') actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
actorall = actorall_prether.findall(html) if not actorall:
return {}
if actorall: a = getActor(html)
actoralls = actorall[0]
actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>')
actor = actor_prether.findall(actoralls)
actor_photo = {} actor_photo = {}
for i in actor: for i in actorall:
actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0]) if i.text in a:
actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
return actor_photo return actor_photo
else: def getStudio(a, html):
return {}
def getStudio(a):
# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
@@ -67,23 +58,25 @@ def getStudio(a):
patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>') patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
pianshang = patherr.findall(a) pianshang = patherr.findall(a)
if pianshang: if pianshang:
result = pianshang[0] result = pianshang[0].strip()
else: if len(result):
result = "" return result
# fall back to the seller as the studio
try:
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
except:
result = ''
return result return result
def getRuntime(a): def getRuntime(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi') return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a): def getLabel(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(a): def getNum(html):
html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+') return str(result2 + result1).strip('+')
@@ -113,8 +106,7 @@ def getRelease(a):
else: else:
result = '' result = ''
return result return result
def getTag(a): def getTag(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
total = [] total = []
@@ -135,11 +127,10 @@ def getTag(a):
pass pass
return total return total
def getCover_small(a, index=0): def getCover_small(html, index=0):
# same issue mentioned below, # same issue mentioned below,
# javdb sometimes returns multiple results # javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result: if not 'https' in result:
@@ -170,66 +161,76 @@ def getTrailer(htmlcode): # get the trailer
video_url = '' video_url = ''
return video_url return video_url
def getExtrafanart(htmlcode): # get the still shots def getExtrafanart(html): # get the still shots result = []
html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>') result = []
html = html_pather.search(htmlcode) try:
if html: result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
html = html.group() except:
extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"') pass
extrafanart_imgs = extrafanart_pather.findall(html) return result
if extrafanart_imgs: def getCover(html):
return extrafanart_imgs
return ''
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
try: try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result return result
def getDirector(a): def getDirector(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number): # get the storyline def getOutline0(number): # get the storyline; airav.wiki returns 404, function renamed for now, delete it once the site is confirmed dead
try: try:
response = json.loads(airav.main(number)) htmlcode = get_html('https://cn.airav.wiki/video/' + number)
result = response['outline'] from WebCrawler.airav import getOutline as airav_getOutline
result = airav_getOutline(htmlcode)
return result return result
except: except:
pass
return '' return ''
def getSeries(a): def getOutline(number, title): # get the storyline, querying several sites concurrently
#/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a return getStoryline(number,title)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def main(number): def main(number):
javdb_site = random.choice(["javdb9", "javdb30"]) # Since the javdb update, only one numbered site can be logged in at a time; the newest login kicks out the old one, so pick the site from the first javdb*.json file found.
# If there is no .json file, or it is past its validity period, pick a site at random.
javdb_sites = ["javdb31", "javdb32"]
debug = config.getInstance().debug()
try: try:
# if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
# pass # pass
# else: # else:
# number = number.upper() # number = number.upper()
number = number.upper() number = number.upper()
cookie_json = './' + javdb_site + '.json'
javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
# Do not load expired cookies; javdb's login page advertises 7 days without re-login, so assume cookies stay valid for 7 days # Do not load expired cookies; javdb's login page advertises 7 days without re-login, so assume cookies stay valid for 7 days
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json) cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath) cdays = file_modification_days(cookies_filepath)
if cdays < 7: if cdays < 7:
javdb_cookies = cookies_dict javdb_cookies = cookies_dict
has_json = True
break
elif cdays != 9999: elif cdays != 9999:
print( print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') if not has_json:
javdb_site = secrets.choice(javdb_sites)
if debug:
print(f'[!]javdb:select site {javdb_site}')
browser = None
try: try:
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
query_result = get_html(javdb_url, cookies=javdb_cookies) res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
if not res.ok:
raise ValueError('javdb search request failed')
query_result = res.text
except: except:
query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies) query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies)
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
@@ -250,61 +251,74 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
raise ValueError("number not found") raise ValueError("number not found")
correct_url = urls[0] correct_url = urls[0]
try: try:
if isinstance(browser, StatefulBrowser): # faster thanks to HTTP keep-alive
detail_page = browser.open_relative(correct_url).text
else:
javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
except: except:
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
# etree.fromstring is expensive, so call it only once; its xpath is fast, faster than bs4 find/select, so use it freely
lx = etree.fromstring(detail_page, etree.HTMLParser())
# no cut image by default # no cut image by default
imagecut = 3 imagecut = 3
# If gray image exists ,then replace with normal cover # If gray image exists ,then replace with normal cover
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
cover_small = getCover_small(query_result) cover_small = getCover_small(html)
else: else:
try: try:
cover_small = getCover_small(query_result, index=ids.index(number)) cover_small = getCover_small(html, index=ids.index(number))
except: except:
# if input number is "STAR438" not "STAR-438", use first search result. # if input number is "STAR438" not "STAR-438", use first search result.
cover_small = getCover_small(query_result) cover_small = getCover_small(html)
if 'placeholder' in cover_small: if 'placeholder' in cover_small:
# replace with the normal cover and cut it # replace with the normal cover and cut it
imagecut = 1 imagecut = 1
cover_small = getCover(detail_page) cover_small = getCover(lx)
dp_number = getNum(detail_page) dp_number = getNum(lx)
if dp_number.upper() != number: if dp_number.upper() != number:
raise ValueError("number not found") raise ValueError("number not found")
title = getTitle(detail_page) title = getTitle(lx)
if title and dp_number: if title and dp_number:
number = dp_number number = dp_number
# remove duplicate title # remove duplicate title
title = title.replace(number, '').strip() title = title.replace(number, '').strip()
dic = { dic = {
'actor': getActor(detail_page), 'actor': getActor(lx),
'title': title, 'title': title,
'studio': getStudio(detail_page), 'studio': getStudio(detail_page, lx),
'outline': getOutline(number), 'outline': getOutline(number, title),
'runtime': getRuntime(detail_page), 'runtime': getRuntime(lx),
'director': getDirector(detail_page), 'director': getDirector(lx),
'release': getRelease(detail_page), 'release': getRelease(detail_page),
'number': number, 'number': number,
'cover': getCover(detail_page), 'cover': getCover(lx),
'cover_small': cover_small, 'cover_small': cover_small,
'trailer': getTrailer(detail_page), 'trailer': getTrailer(detail_page),
'extrafanart': getExtrafanart(detail_page), 'extrafanart': getExtrafanart(lx),
'imagecut': imagecut, 'imagecut': imagecut,
'tag': getTag(detail_page), 'tag': getTag(lx),
'label': getLabel(detail_page), 'label': getLabel(lx),
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(detail_page), # 'actor_photo': getActorPhoto(lx, javdb_site, browser),
'website': 'https://javdb.com' + correct_url, 'website': 'https://javdb.com' + correct_url,
'source': 'javdb.py', 'source': 'javdb.py',
'series': getSeries(detail_page), 'series': getSeries(lx),
} }
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
dic['series'] = dic['studio']
if not dic['label']:
dic['label'] = dic['studio']
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -313,10 +327,18 @@ f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not b
# main('DV-1562') # main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__": if __name__ == "__main__":
config.G_conf_override['debug_mode:switch'] = True
# print(main('blacked.20.05.30')) # print(main('blacked.20.05.30'))
# print(main('AGAV-042')) # print(main('AGAV-042'))
# print(main('BANK-022')) # print(main('BANK-022'))
print(main('FC2-735670')) # print(main('070116-197'))
print(main('FC2-1174949')) # not found # print(main('093021_539')) # no still shots; studio pacopacomama
# print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
print(main('MVSD-439')) print(main('MVSD-439'))
print(main('EHM0001')) # not found # print(main('EHM0001')) # not found
# print(main('FC2-2314275'))
# print(main('EBOD-646'))
# print(main('LOVE-262'))
print(main('ABP-890'))
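The cookie-selection loop above relies on the project's load_cookies and file_modification_days helpers; a self-contained approximation of the same policy, using only the standard library and the file names from the diff:

import json, time, secrets
from pathlib import Path

javdb_sites = ['javdb31', 'javdb32']
javdb_cookies = {'over18': '1', 'theme': 'auto', 'locale': 'zh'}
javdb_site = None
for s in javdb_sites:
    p = Path(s + '.json')
    # only trust cookie files younger than the assumed 7-day login lifetime
    if p.is_file() and (time.time() - p.stat().st_mtime) < 7 * 86400:
        javdb_cookies = json.loads(p.read_text(encoding='utf-8'))
        javdb_site = s
        break
if javdb_site is None:
    javdb_site = secrets.choice(javdb_sites)  # no fresh cookies: pick a site at random
print(javdb_site, sorted(javdb_cookies))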

@@ -137,7 +137,7 @@ def main(number2):
'series': getSeries(a), 'series': getSeries(a),
} }
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}

WebCrawler/storyline.py Normal file
@@ -0,0 +1,334 @@
import sys
sys.path.append('../')
import re
import json
import builtins
from ADC_function import *
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
from number_parser import is_uncensored
G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池','进程池')
class noThread(object):
def map(self, fn, param):
return builtins.map(fn, param)
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
# Fetch the storyline: the listed sites are queried concurrently; priority runs from first to last
def getStoryline(number, title, sites: list=None):
start_time = time.time()
conf = config.getInstance()
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
if is_uncensored(number):
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
r_dup = set()
apply_sites = []
for s in storyine_sites:
if s in G_registered_storyline_site and s not in r_dup:
apply_sites.append(s)
r_dup.add(s)
mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = conf.storyline_mode()
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
result = pool.map(getStoryline_mp, mp_args)
result = list(result) if run_mode == 0 else result
if not debug and conf.storyline_show() == 0:
for value in result:
if isinstance(value, str) and len(value):
return value
return ''
# The debug output below is written to the log; output from the process pool is not, it only appears on stdout
cnt = len(apply_sites)
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
first = True
sel = ''
for i in range(cnt):
sl = len(result[i]) if isinstance(result[i], str) else 0
if sl and first:
s += f'[选中{apply_sites[i]}字数:{sl}]'
first = False
sel = result[i]
elif sl:
s += f'{apply_sites[i]}字数:{sl}'
else:
s += f'{apply_sites[i]}:空'
print(s)
return sel
def getStoryline_mp(args):
return _getStoryline_mp(*args)
# Note: print() from a spawned process is not written to the log; when debugging a broken data source, watch stdout directly, and attach a screenshot when filing an issue
def _getStoryline_mp(site, number, title, debug):
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
try:
number_up = number.upper()
site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, browser = get_html_by_browser(url, return_type='browser')
if not res.ok:
raise ValueError(f"get_html_by_browser('{url}') failed")
avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
if number_up not in avs.select_one('a > h3').text.upper():
raise ValueError("number not found")
detail_url = avs.select_one('a')['href']
res = browser.open_relative(detail_url)
if not res.ok:
raise ValueError(f"browser.open_relative('{detail_url}') failed")
t = browser.page.select_one('head > title').text
airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
if number.upper() != airav_number:
raise ValueError(f"page number ->[{airav_number}] not match")
desc = browser.page.select_one('li.introduction > span').text.strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP getOutline_amazon Error: {e},number [{number}].")
pass
return None
def getStoryline_58avgo(number, debug):
try:
url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
'', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
'?status=1&Sort=Playon', '?status=1&Sort=dateupload', 'status=1&Sort=dateproduce'
]) # 随机选一个避免网站httpd日志中单个ip的请求太过单一
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
result, browser = get_html_by_form(url,
fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
return_type = 'browser')
if not result.ok:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
if f'searchresults.aspx?Search={kwd}' not in browser.url:
raise ValueError("number not found")
s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
link = None
for i in range(len(s)):
title = s[i].h3.text.strip()
if re.search(number, title, re.I):
link = s[i]
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or 'playon.aspx' not in browser.url:
raise ValueError("detail page not found")
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError("detail page number not match, got ->[{detail_number}]")
return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1(number, debug): # fetch the storyline from avno1.cc
try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
]) # pick one at random so a single IP's requests don't look too uniform in the site's httpd logs
result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number},
return_type = 'browser')
if not result.ok:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
s = browser.page.select('div.type_movie > div > ul > li > div')
for i in range(len(s)):
title = s[i].a.h3.text.strip()
page_number = title[title.rfind(' '):].strip()
if re.search(number, page_number, re.I):
return s[i]['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_xcity(number, debug): # fetch the storyline from xcity
try:
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("detail page not found")
return browser.page.select_one('h2.title-detail + p.lead').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
pass
return ''
def getStoryline_amazon(q_title, number, debug):
if not isinstance(q_title, str) or not len(q_title):
return None
try:
amazon_cookie, _ = load_cookies('amazon.json')
cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
url = "https://www.amazon.co.jp/s?k=" + q_title
res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
if not res.ok:
raise ValueError("get_html_by_browser() failed")
lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
if isinstance(lks, list) and len(lks):
browser.follow_link(lks[0])
cookie = None
html = etree.fromstring(str(browser.page), etree.HTMLParser())
titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
if not isinstance(idx, int) or idx < 0:
raise ValueError("title and number not found")
furl = urls[idx]
r = browser.open_relative(furl)
if not r.ok:
raise ValueError("browser.open_relative()) failed.")
lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
if isinstance(lks, list) and len(lks):
browser.follow_link(lks[0])
cookie = None
ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
ama_t = re.sub(r'審査番号:\d+', '', ama_t)
if cookie is None:
# The auto-created cookies file sits at the end of the search path list, with the lowest priority. Users with an amazon.co.jp account can export cookies from their browser into a path earlier in the list.
ama_save = Path.home() / ".local/share/avdc/amazon.json"
ama_save.parent.mkdir(parents=True, exist_ok=True)
ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
return ama_t
except Exception as e:
if debug:
print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
pass
return None
# From the DVD and Blu-ray listings on the shelf, pick the one whose title is most similar
def amazon_select_one(a_titles, q_title, number, debug):
sel = -1
ratio = 0
que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
for loc in range(len(a_titles)):
t = a_titles[loc]
if re.search(number, t, re.I): # titles rarely carry the ID number, but a very few do; an exact number match passes immediately
return loc
if not re.search('DVD|Blu-ray', t, re.I):
continue
ama_t = str(re.sub('DVD|Blu-ray', "", t, flags=re.I))
ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
findlen = 0
lastpos = -1
cnt = len(ama_t)
for c in reversed(ama_t):
cnt -= 1
pos = que_t.rfind(c)
if lastpos >= 0:
pos_near = que_t[:lastpos].rfind(c)
if pos_near < 0:
findlen = 0
lastpos = -1
ama_t = ama_t[:cnt+1]
else:
pos = pos_near
if pos < 0:
if category(c) == 'Nd':
return -1
ama_t = ama_t[:cnt]
findlen = 0
lastpos = -1
continue
if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
findlen += 1
lastpos = pos
if findlen >= 4:
break
continue
findlen = 1
lastpos = pos
if findlen==0:
return -1
r = SequenceMatcher(None, ama_t, que_t).ratio()
if r > ratio:
sel = loc
ratio = r
save_t_ = ama_t
if ratio > 0.999:
break
if ratio < 0.5:
return -1
if not debug:
# for now, only results with similarity of 0.9 or higher are trusted
return sel if ratio >= 0.9 else -1
# in debug mode, log data on match accuracy
if ratio < 0.9:
# rejected results with similarity in [0.5, 0.9) are logged separately
(Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# log of accepted results
(Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel
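The pool dispatch in getStoryline stands on its own: a single with-statement selects sequential execution, a thread pool, or a process pool. A trimmed, runnable sketch with a trivial worker standing in for _getStoryline_mp:

from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

class noThread(object):
    # mimics the slice of the Pool API that getStoryline needs
    def map(self, fn, param):
        return map(fn, param)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

def work(x):  # stand-in for _getStoryline_mp
    return x * x

def run(run_mode, items, cores=2):
    with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
        result = pool.map(work, items)
        return list(result)  # mode 0 yields a lazy map object, so materialize it

if __name__ == '__main__':
    for mode in (0, 1, 2):
        print(mode, run(mode, [1, 2, 3]))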

@@ -3,16 +3,12 @@ sys.path.append('../')
import re import re
from lxml import etree from lxml import etree
import json import json
from bs4 import BeautifulSoup
from ADC_function import * from ADC_function import *
from WebCrawler.storyline import getStoryline
# import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(a): def getTitle(html):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath('//*[@id="program_detail_title"]/text()')[0] result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
return result return result
@@ -43,8 +39,7 @@ def getActorPhoto(browser):
return o return o
def getStudio(a): def getStudio(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
except: except:
@@ -52,20 +47,14 @@ def getStudio(a):
return result.strip('+').replace("', '", '').replace('"', '') return result.strip('+').replace("', '", '').replace('"', '')
def getRuntime(a): def getRuntime(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
except: return x
return ''
try:
return re.findall('\d+',result1)[0]
except: except:
return '' return ''
def getLabel(html):
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
return result return result
@@ -73,8 +62,7 @@ def getLabel(a):
return '' return ''
def getNum(a): def getNum(html):
html = etree.fromstring(a, etree.HTMLParser())
try: try:
result = html.xpath('//*[@id="hinban"]/text()')[0] result = html.xpath('//*[@id="hinban"]/text()')[0]
return result return result
@@ -90,8 +78,7 @@ def getYear(getRelease):
return getRelease return getRelease
def getRelease(a): def getRelease(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except: except:
@@ -102,31 +89,22 @@ def getRelease(a):
return '' return ''
def getTag(a): def getTag(html):
result2=[] x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else []
result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
for i in result1:
i=i.replace(u'\n','')
i=i.replace(u'\t','')
if len(i):
result2.append(i)
return result2
def getCover_small(a, index=0): def getCover_small(html, index=0):
# same issue mentioned below, # same issue mentioned below,
# javdb sometimes returns multiple results # javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with the correct index number # DO NOT just get the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index] result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result: if not 'https' in result:
result = 'https:' + result result = 'https:' + result
return result return result
def getCover(htmlcode): def getCover(html):
html = etree.fromstring(htmlcode, etree.HTMLParser())
try: try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
return 'https:' + result return 'https:' + result
@@ -134,8 +112,7 @@ def getCover(htmlcode):
return '' return ''
def getDirector(a): def getDirector(html):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try: try:
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
return result return result
@@ -143,19 +120,21 @@ def getDirector(a):
return '' return ''
def getOutline(htmlcode): def getOutline(html, number, title):
html = etree.fromstring(htmlcode, etree.HTMLParser()) storyline_site = config.getInstance().storyline_site().split(',')
a = set(storyline_site) & {'airav', 'avno1'} # only sites that return Chinese synopsis text
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site)
if len(g):
return g
try: try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0] x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
return x.replace(getNum(html), '')
except: except:
return '' return ''
try:
return re.sub('\\\\\w*\d+','',result)
except:
return result
def getSeries(htmlcode): def getSeries(html):
html = etree.fromstring(htmlcode, etree.HTMLParser())
try: try:
try: try:
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
@@ -181,11 +160,10 @@ def getExtrafanart(htmlcode): # get the still shots
return s return s
return '' return ''
def main(number): def open_by_browser(number):
try:
xcity_number = number.replace('-','') xcity_number = number.replace('-','')
query_result, browser = get_html_by_form( query_result, browser = get_html_by_form(
'https://xcity.jp/about/', 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()}, fields = {'q' : xcity_number.lower()},
return_type = 'browser') return_type = 'browser')
if not query_result or not query_result.ok: if not query_result or not query_result.ok:
@@ -193,38 +171,44 @@ def main(number):
result = browser.follow_link(browser.links('avod\/detail')[0]) result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok: if not result.ok:
raise ValueError("xcity.py: detail page not found") raise ValueError("xcity.py: detail page not found")
detail_page = str(browser.page) return str(browser.page), browser
def main(number):
try:
detail_page, browser = open_by_browser(number)
url = browser.url url = browser.url
newnum = getNum(detail_page).upper() lx = etree.fromstring(detail_page, etree.HTMLParser())
newnum = getNum(lx).upper()
number_up = number.upper() number_up = number.upper()
if newnum != number_up: if newnum != number_up:
if newnum == xcity_number.upper(): if newnum == number.replace('-','').upper():
newnum = number_up newnum = number_up
else: else:
raise ValueError("xcity.py: number not found") raise ValueError("xcity.py: number not found")
title = getTitle(lx)
dic = { dic = {
'actor': getActor(browser), 'actor': getActor(browser),
'title': getTitle(detail_page), 'title': title,
'studio': getStudio(detail_page), 'studio': getStudio(lx),
'outline': getOutline(detail_page), 'outline': getOutline(lx, number, title),
'runtime': getRuntime(detail_page), 'runtime': getRuntime(lx),
'director': getDirector(detail_page), 'director': getDirector(lx),
'release': getRelease(detail_page), 'release': getRelease(lx),
'number': newnum, 'number': newnum,
'cover': getCover(detail_page), 'cover': getCover(lx),
'cover_small': '', 'cover_small': '',
'extrafanart': getExtrafanart(detail_page), 'extrafanart': getExtrafanart(detail_page),
'imagecut': 1, 'imagecut': 1,
'tag': getTag(detail_page), 'tag': getTag(lx),
'label': getLabel(detail_page), 'label': getLabel(lx),
'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), 'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
# 'actor_photo': getActorPhoto(browser), # 'actor_photo': getActorPhoto(browser),
'website': url, 'website': url,
'source': 'xcity.py', 'source': 'xcity.py',
'series': getSeries(detail_page), 'series': getSeries(lx),
} }
except Exception as e: except Exception as e:
if config.Config().debug(): if config.getInstance().debug():
print(e) print(e)
dic = {"title": ""} dic = {"title": ""}

config.ini
@@ -1,12 +1,13 @@
 [common]
 main_mode=1
+source_folder=./
 failed_output_folder=failed
 success_output_folder=JAV_output
 soft_link=0
 failed_move=1
 auto_exit=0
 transalte_to_sc=0
-multi_threading=1
+multi_threading=0
 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
 actor_gender=female
 del_empty_folder=1
@@ -16,6 +17,8 @@ nfo_skip_days=30
 ; stop after processing this many video files; 0 processes all of them
 stop_counter=0
 ; combined, the two options above let you scrape or organize thousands of files in several small batches without triggering bans from translation or metadata sites
+ignore_failed_list=0
+download_only_missing_images=1

 [proxy]
 ;proxytype: http or socks5 or socks5h switch: 0 1
@@ -62,8 +65,7 @@ switch=0
 ; used to determine whether a movie is uncensored
 [uncensored]
-uncensored_prefix=S2M,BT,LAF,SMD
+uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED

 [media]
 ; video file extensions
@@ -82,3 +84,20 @@ water=2
 switch=0
 extrafanart_folder=extrafanart
+
+; storyline
+[storyline]
+; when website is javbus, javdb, avsox, xcity or carib, the lists site / censored_site / uncensored_site
+; name the optional data sources for storyline text. Sites in a list are queried concurrently; priority
+; runs left to right, and a site further right is only used when everything before it returned nothing.
+; airav, avno1 and 58avgo serve Chinese storylines: airav only covers censored titles, avno1 covers both
+; censored and uncensored, and 58avgo only covers uncensored or leaked/decensored titles (feature unused).
+; xcity and amazon are Japanese; since the amazon store carries no release IDs, picking the matching DVD
+; is only 99.6% accurate. If all three lists are empty nothing is queried, which greatly speeds up scraping.
+; site=
+site=avno1
+censored_site=airav,xcity,amazon
+uncensored_site=58avgo
+; run mode 0: sequential (slowest) 1: thread pool (default) 2: process pool (bigger startup cost than threads; faster the more sites run concurrently)
+run_mode=1
+; show_result: storyline debug info, 0 off, 1 brief, 2 verbose (the verbose part is not logged); turn on 2 to see why storyline lookup fails
+show_result=0
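
Editor's note: a minimal sketch of how these comma-separated lists can be consumed, applying the left-to-right priority after all sites have answered (the fetch() helper is hypothetical):

    import configparser
    cfg = configparser.ConfigParser()
    cfg.read('config.ini', encoding='utf-8-sig')
    sites = [s.strip() for s in cfg.get('storyline', 'censored_site').split(',') if s.strip()]
    # results = {s: fetch(s) for s in sites}                       # fetch() is hypothetical
    # text = next((results[s] for s in sites if results[s]), '')   # first non-empty wins
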

config.py

@@ -1,33 +1,82 @@
 import os
+import re
 import sys
 import configparser
-import codecs
 from pathlib import Path

+G_conf_override = {
+    # index 0 stores the first Config() instance for quick access via getInstance()
+    0 : None,
+    # registered override config items
+    "common:main_mode" : None,
+    "common:source_folder" : None,
+    "common:auto_exit" : None,
+    "common:nfo_skip_days" : None,
+    "common:stop_counter" : None,
+    "common:ignore_failed_list" : None,
+    "debug_mode:switch" : None
+}
+
+def getInstance():
+    if isinstance(G_conf_override[0], Config):
+        return G_conf_override[0]
+    return Config()
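
Editor's note: slot 0 caches the first Config() ever built, so getInstance() hands back the already-parsed instance instead of re-reading the ini, and the string keys let callers override single items process-wide. A short usage sketch of the API added above:

    # The first construction parses config.ini and registers itself in G_conf_override[0].
    c1 = Config()
    assert getInstance() is c1                    # cached; no second parse
    G_conf_override["common:main_mode"] = 2       # override applies to every instance
    assert getInstance().main_mode() == 2
    G_conf_override["common:main_mode"] = None    # fall back to the ini value
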
 class Config:
     def __init__(self, path: str = "config.ini"):
-        path_search_order = [
-            path,
-            "./config.ini",
-            os.path.join(Path.home(), "avdc.ini"),
-            os.path.join(Path.home(), ".avdc.ini"),
-            os.path.join(Path.home(), ".avdc/config.ini"),
-            os.path.join(Path.home(), ".config/avdc/config.ini")
-        ]
+        path_search_order = (
+            Path(path),
+            Path.cwd() / "config.ini",
+            Path.home() / "avdc.ini",
+            Path.home() / ".avdc.ini",
+            Path.home() / ".avdc/config.ini",
+            Path.home() / ".config/avdc/config.ini"
+        )
         ini_path = None
         for p in path_search_order:
-            if os.path.isfile(p):
-                ini_path = p
+            if p.is_file():
+                ini_path = p.resolve()
                 break
         if ini_path:
             self.conf = configparser.ConfigParser()
+            self.ini_path = ini_path
             try:
-                self.conf.read(ini_path, encoding="utf-8-sig")
+                if self.conf.read(ini_path, encoding="utf-8-sig"):
+                    if G_conf_override[0] is None:
+                        G_conf_override[0] = self
             except:
-                self.conf.read(ini_path, encoding="utf-8")
+                if self.conf.read(ini_path, encoding="utf-8"):
+                    if G_conf_override[0] is None:
+                        G_conf_override[0] = self
         else:
-            print("[-]Config file not found!")
-            sys.exit(2)
+            print("ERROR: Config file not found!")
+            print("Please put config file into one of the following path:")
+            print('\n'.join([str(p.resolve()) for p in path_search_order[2:]]))
+            # When no config file is found, shipping the matching default config inside the build and
+            # generating it on a search path when needed is more reliable than having the user hunt down
+            # a version-mismatched copy. This way the single executable is fully functional and can be
+            # run safely from any path.
+            res_path = None
+            # pyinstaller build: look for config.ini inside the bundle
+            if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file():
+                res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini'
+            # when running as a script, look next to the script itself
+            elif (Path(__file__).resolve().parent / 'config.ini').is_file():
+                res_path = Path(__file__).resolve().parent / 'config.ini'
+            if res_path is None:
+                sys.exit(2)
+            ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:")
+            if re.search('n', ins, re.I):
+                sys.exit(2)
+            # Only the home directory is guaranteed writable, so ~/avdc.ini is used as the generated
+            # config path rather than the current directory, which may not be writable. Current versions
+            # no longer encourage keeping the config in the working directory; that survives only as a
+            # trick for switching between multiple config files.
+            write_path = path_search_order[2]  # Path.home() / "avdc.ini"
+            write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8')
+            print("Config file '{}' created.".format(write_path.resolve()))
+            input("Press Enter key exit...")
+            sys.exit(0)
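
Editor's note: sys._MEIPASS is the temporary directory where PyInstaller's onefile bootloader unpacks everything passed via --add-data, which is why the bundled config.ini is searched there first. A generic access helper in the same spirit (a sketch, not project code):

    import sys
    from pathlib import Path

    def resource_path(name: str) -> Path:
        # Inside a PyInstaller onefile build, data files live under sys._MEIPASS;
        # running from source, fall back to the directory of this script.
        base = Path(getattr(sys, '_MEIPASS', Path(__file__).resolve().parent))
        return base / name
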
         # self.conf = self._default_config()
         # try:
         #     self.conf = configparser.ConfigParser()
@@ -40,13 +89,24 @@ class Config:
         #     print("[-]",e)
         #     sys.exit(3)
         #     #self.conf = self._default_config()

+    def getboolean_override(self, section, item) -> bool:
+        return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"])
+
+    def getint_override(self, section, item) -> int:
+        return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"])
+
+    def get_override(self, section, item) -> str:
+        return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"])
+
-    def main_mode(self) -> str:
+    def main_mode(self) -> int:
         try:
-            return self.conf.getint("common", "main_mode")
+            return self.getint_override("common", "main_mode")
         except ValueError:
             self._exit("common:main_mode")

+    def source_folder(self) -> str:
+        return self.get_override("common", "source_folder")
+
     def failed_folder(self) -> str:
         return self.conf.get("common", "failed_output_folder")
@@ -61,7 +121,7 @@ class Config:
     def failed_move(self) -> bool:
         return self.conf.getboolean("common", "failed_move")

     def auto_exit(self) -> bool:
-        return self.conf.getboolean("common", "auto_exit")
+        return self.getboolean_override("common", "auto_exit")

     def transalte_to_sc(self) -> bool:
         return self.conf.getboolean("common", "transalte_to_sc")

     def multi_threading(self) -> bool:
@@ -70,14 +130,18 @@ class Config:
         return self.conf.getboolean("common", "del_empty_folder")

     def nfo_skip_days(self) -> int:
         try:
-            return self.conf.getint("common", "nfo_skip_days")
+            return self.getint_override("common", "nfo_skip_days")
         except:
             return 30

     def stop_counter(self) -> int:
         try:
-            return self.conf.getint("common", "stop_counter")
+            return self.getint_override("common", "stop_counter")
         except:
             return 0

+    def ignore_failed_list(self) -> bool:
+        return self.getboolean_override("common", "ignore_failed_list")
+
+    def download_only_missing_images(self) -> bool:
+        return self.conf.getboolean("common", "download_only_missing_images")
+
     def is_transalte(self) -> bool:
         return self.conf.getboolean("transalte", "switch")

     def is_trailer(self) -> bool:
@@ -173,7 +237,39 @@ class Config:
         return self.conf.get("escape", "folders")

     def debug(self) -> bool:
-        return self.conf.getboolean("debug_mode", "switch")
+        return self.getboolean_override("debug_mode", "switch")
+
+    def storyline_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "site")
+        except:
+            return "avno1"
+
+    def storyline_censored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "censored_site")
+        except:
+            return "airav,xcity,amazon"
+
+    def storyline_uncensored_site(self) -> str:
+        try:
+            return self.conf.get("storyline", "uncensored_site")
+        except:
+            return "58avgo"
+
+    def storyline_show(self) -> int:
+        try:
+            v = self.conf.getint("storyline", "show_result")
+            return v if v in (0,1,2) else 2 if v > 2 else 0
+        except:
+            return 0
+
+    def storyline_mode(self) -> int:
+        try:
+            v = self.conf.getint("storyline", "run_mode")
+            return v if v in (0,1,2) else 2 if v > 2 else 0
+        except:
+            return 1
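
Editor's note: the conditional expression in storyline_show and storyline_mode simply clamps the configured integer into the 0..2 range; an equivalent, more conventional form:

    # Equivalent clamp for the 0..2 range used by show_result / run_mode:
    clamp = lambda v: min(max(v, 0), 2)
    assert clamp(-5) == 0 and clamp(1) == 1 and clamp(9) == 2
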
     @staticmethod
     def _exit(sec: str) -> None:
@@ -188,6 +284,7 @@ class Config:
         sec1 = "common"
         conf.add_section(sec1)
         conf.set(sec1, "main_mode", "1")
+        conf.set(sec1, "source_folder", "./")
         conf.set(sec1, "failed_output_folder", "failed")
         conf.set(sec1, "success_output_folder", "JAV_output")
         conf.set(sec1, "soft_link", "0")
@@ -199,6 +296,8 @@ class Config:
         conf.set(sec1, "del_empty_folder", "1")
         conf.set(sec1, "nfo_skip_days", 30)
         conf.set(sec1, "stop_counter", 0)
+        conf.set(sec1, "ignore_failed_list", 0)
+        conf.set(sec1, "download_only_missing_images", 1)

         sec2 = "proxy"
         conf.add_section(sec2)
@@ -265,6 +364,14 @@ class Config:
         conf.set(sec13, "switch", 1)
         conf.set(sec13, "extrafanart_folder", "extrafanart")

+        sec14 = "storyline"
+        conf.add_section(sec14)
+        conf.set(sec14, "site", "avno1")
+        conf.set(sec14, "censored_site", "airav,xcity,amazon")
+        conf.set(sec14, "uncensored_site", "58avgo")
+        conf.set(sec14, "show_result", 0)
+        conf.set(sec14, "run_mode", 1)
+
         return conf
@@ -308,9 +415,45 @@ if __name__ == "__main__":
         code = compile(evstr, "<string>", "eval")
         print('{}: "{}"'.format(evstr, eval(code)))
     config = Config()
-    mfilter = ('conf', 'proxy', '_exit', '_default_config')
+    mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'}
     for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
         evprint(f'config.{_m}()')
-    pfilter = ('proxies', 'SUPPORT_PROXY_TYPE')
-    for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]:
-        evprint(f'config.proxy().{_p}')
+    pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}
+    # test getInstance()
+    assert(getInstance() == config)
+    for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]:
+        evprint(f'getInstance().proxy().{_p}')
+    # Override Test
+    G_conf_override["common:nfo_skip_days"] = 4321
+    G_conf_override["common:stop_counter"] = 1234
+    assert config.nfo_skip_days() == 4321
+    assert getInstance().stop_counter() == 1234
+    # remove override
+    G_conf_override["common:stop_counter"] = None
+    G_conf_override["common:nfo_skip_days"] = None
+    assert config.nfo_skip_days() != 4321
+    assert config.stop_counter() != 1234
+    # Create new instance
+    conf2 = Config()
+    assert getInstance() != conf2
+    assert getInstance() == config
+    G_conf_override["common:main_mode"] = 9
+    G_conf_override["common:source_folder"] = "A:/b/c"
+    # Overrides affect all instances
+    assert config.main_mode() == 9
+    assert conf2.main_mode() == 9
+    assert getInstance().main_mode() == 9
+    assert conf2.source_folder() == "A:/b/c"
+    print("### Override Test ###".center(36))
+    evprint('getInstance().main_mode()')
+    evprint('config.source_folder()')
+    G_conf_override["common:main_mode"] = None
+    evprint('conf2.main_mode()')
+    evprint('config.main_mode()')
+    # accessing an unregistered key raises KeyError
+    try:
+        print(G_conf_override["common:actor_gender"])
+    except KeyError as ke:
+        print(f'Caught KeyError: {ke} is not a registered key of the G_conf_override dict.', file=sys.stderr)
+    print(f"Load Config file '{conf2.ini_path}'.")

core.py

@@ -3,8 +3,6 @@ import os.path
 import pathlib
 import re
 import shutil
-import platform
-import errno
 import sys

 from PIL import Image
@@ -14,7 +12,7 @@ from datetime import datetime
 from ADC_function import *
 from WebCrawler import get_data_from_json
+from number_parser import is_uncensored

 def escape_path(path, escape_literals: str):  # Remove escape literals
     backslash = '\\'
@@ -23,7 +21,8 @@ def escape_path(path, escape_literals: str):  # Remove escape literals
     return path

-def moveFailedFolder(filepath, conf):
+def moveFailedFolder(filepath):
+    conf = config.getInstance()
     failed_folder = conf.failed_folder()
     soft_link = conf.soft_link()
     # in mode 3 or with soft links, maintain a failed list instead; it is loaded when the scan starts so these paths are excluded and never reprocessed
@@ -33,7 +32,6 @@ def moveFailedFolder(filepath):
         print("[-]Add to Failed List file, see '%s'" % ftxt)
         with open(ftxt, 'a', encoding='utf-8') as flt:
             flt.write(f'{filepath}\n')
-            flt.close()
     elif conf.failed_move() and not soft_link:
         failed_name = os.path.join(failed_folder, os.path.basename(filepath))
         mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt'))
@@ -41,8 +39,13 @@ def moveFailedFolder(filepath):
         with open(mtxt, 'a', encoding='utf-8') as wwibbmt:
             tmstr = datetime.now().strftime("%Y-%m-%d %H:%M")
             wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n')
-            wwibbmt.close()
+        try:
+            if os.path.exists(failed_name):
+                print('[-]File Exists while moving to FailedFolder')
+                return
             shutil.move(filepath, failed_name)
+        except:
+            print('[-]File Moving to FailedFolder unsuccessful!')
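
Editor's note: the failed list written above is meant to be read back on the next scan so listed paths are skipped unless ignore_failed_list=1. The writer's filename comes from an elided line (ftxt), so the name below is a stand-in; a hypothetical reader assuming one path per line:

    import os

    def load_failed_list(failed_folder: str) -> set:
        # Hypothetical: 'failed_list.txt' stands in for the elided ftxt filename.
        ftxt = os.path.join(failed_folder, 'failed_list.txt')
        if not os.path.isfile(ftxt):
            return set()
        with open(ftxt, encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}
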
def get_info(json_data):  # return the fields stored in the json
@@ -63,14 +66,15 @@ def get_info(json_data):
     return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label

-def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath):
+def small_cover_check(path, number, cover_small, leak_word, c_word, filepath):
     filename = f"{number}{leak_word}{c_word}-poster.jpg"
-    download_file_with_filename(cover_small, filename, path, conf, filepath)
+    download_file_with_filename(cover_small, filename, path, filepath)
     print('[+]Image Downloaded! ' + os.path.join(path, filename))

-def create_folder(json_data, conf: config.Config):  # create the destination folder
+def create_folder(json_data):  # create the destination folder
     title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
+    conf = config.getInstance()
     success_folder = conf.success_folder()
     actor = json_data.get('actor')
     location_rule = eval(conf.location_rule(), json_data)
@@ -81,35 +85,40 @@ def create_folder(json_data):
     if 'title' in conf.location_rule() and len(title) > maxlen:
         shorttitle = title[0:maxlen]
         location_rule = location_rule.replace(title, shorttitle)
-    path = os.path.join(success_folder, location_rule).strip()
-    if not os.path.isdir(path):
+    # when actor is empty, location_rule evaluates to an absolute path like '/number', which makes the
+    # path join ignore its first argument; prefixing './' keeps it relative in every case
+    path = os.path.join(success_folder, f'./{location_rule.strip()}')
+    if not os.path.exists(path):
         path = escape_path(path, conf.escape_literals())
         try:
             os.makedirs(path)
-            if not os.path.isdir(path):
-                raise
         except:
             path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number")
             path = escape_path(path, conf.escape_literals())
+            try:
                 os.makedirs(path)
-        return path
+            except:
+                print(f"[-]Fatal error! Can not make folder '{path}'")
+                sys.exit(0)
+    return os.path.normpath(path)
# ===================== download section ===========================
# path = example: photo, video in the project folder!
-def download_file_with_filename(url, filename, path, conf: config.Config, filepath):
+def download_file_with_filename(url, filename, path, filepath):
+    conf = config.getInstance()
     configProxy = conf.proxy()

     for i in range(configProxy.retry):
         try:
             if configProxy.enable:
-                if not os.path.isdir(path):
+                if not os.path.exists(path):
+                    try:
                         os.makedirs(path)
-                    if not os.path.isdir(path):
-                        raise IOError
+                    except:
+                        print(f"[-]Fatal error! Can not make folder '{path}'")
+                        sys.exit(0)
                 proxies = configProxy.proxies()
                 headers = {
                     'User-Agent': G_USER_AGENT}
@@ -121,10 +130,12 @@ def download_file_with_filename(...)
                     code.write(r.content)
                     return
             else:
-                if not os.path.isdir(path):
+                if not os.path.exists(path):
+                    try:
                         os.makedirs(path)
-                    if not os.path.isdir(path):
-                        raise IOError
+                    except:
+                        print(f"[-]Fatal error! Can not make folder '{path}'")
+                        sys.exit(0)
                 headers = {
                     'User-Agent': G_USER_AGENT}
                 r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -148,46 +159,50 @@ def download_file_with_filename(...)
             print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
     except IOError:
         print(f"[-]Create Directory '{path}' failed!")
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
     print('[-]Connect Failed! Please check your Proxy or Network!')
-    moveFailedFolder(filepath, conf)
+    moveFailedFolder(filepath)
     return

-def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config):
-    if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed':
+def trailer_download(trailer, leak_word, c_word, number, path, filepath):
+    if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed':
         return
-    configProxy = conf.proxy()
+    configProxy = config.getInstance().proxy()
     for i in range(configProxy.retry):
-        if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+        if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'):
             print('[!]Video Download Failed! Trying again. [{}/3]', i + 1)
-            download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath)
+            download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath)
             continue
         else:
             break
-    if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+    if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'):
        return
    print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4')
# stills downloaded successfully, otherwise move to failed
-def extrafanart_download(data, path, conf: config.Config, filepath):
+def extrafanart_download(data, path, filepath):
     j = 1
+    conf = config.getInstance()
     path = os.path.join(path, conf.get_extrafanart())
+    configProxy = conf.proxy()
+    download_only_missing_images = conf.download_only_missing_images()
     for url in data:
         jpg_filename = f'extrafanart-{j}.jpg'
         jpg_fullpath = os.path.join(path, jpg_filename)
-        if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed':
-            moveFailedFolder(filepath, conf)
+        if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
+            continue
+        if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed':
+            moveFailedFolder(filepath)
             return
-        configProxy = conf.proxy()
         for i in range(configProxy.retry):
-            if os.path.getsize(jpg_fullpath) == 0:
+            if file_not_exist_or_empty(jpg_fullpath):
                 print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
-                download_file_with_filename(url, jpg_filename, path, conf, filepath)
+                download_file_with_filename(url, jpg_filename, path, filepath)
                 continue
             else:
                 break
-        if os.path.getsize(jpg_fullpath) == 0:
+        if file_not_exist_or_empty(jpg_fullpath):
             return
         print('[+]Image Downloaded!', jpg_fullpath)
         j += 1
@@ -195,39 +210,46 @@ def extrafanart_download(data, path, filepath):
# check the cover downloaded successfully, otherwise move to failed
-def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath):
+def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
     full_filepath = os.path.join(path, filename)
-    if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed':
-        moveFailedFolder(filepath, conf)
+    if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
+        return
+    if download_file_with_filename(cover, filename, path, filepath) == 'failed':
+        moveFailedFolder(filepath)
         return
-    configProxy = conf.proxy()
+    configProxy = config.getInstance().proxy()
     for i in range(configProxy.retry):
-        if os.path.getsize(full_filepath) == 0:
+        if file_not_exist_or_empty(full_filepath):
             print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
-            download_file_with_filename(cover, filename, path, conf, filepath)
+            download_file_with_filename(cover, filename, path, filepath)
             continue
         else:
             break
-    if os.path.getsize(full_filepath) == 0:
+    if file_not_exist_or_empty(full_filepath):
         return
     print('[+]Image Downloaded!', full_filepath)
     shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg"))

-def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf):
+def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored):
     title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
-    failed_folder = conf.failed_folder()
-    if conf.main_mode() == 3:  # in mode 3 the video file is untouched, so the .nfo must match the video filename (apart from the extension) for KODI and friends to find it
+    if config.getInstance().main_mode() == 3:  # in mode 3 the video file is untouched, so the .nfo must match the video filename (apart from the extension) for KODI and friends to find it
         nfo_path = str(Path(filepath).with_suffix('.nfo'))
     else:
         nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo")
     try:
-        if not os.path.isdir(path):
+        if not os.path.exists(path):
+            try:
                 os.makedirs(path)
-            if not os.path.isdir(path):
-                raise IOError
+            except:
+                print(f"[-]Fatal error! can not make folder '{path}'")
+                sys.exit(0)
+        # KODI's movie-info view has nowhere to show the number; naming_rule=number+'#'+title would fix
+        # that but makes titles too long. The usually-empty outline is a better home for it, and the
+        # display area players reserve for outline is larger anyway
+        outline = f"{number}#{outline}"
         with open(nfo_path, "wt", encoding='UTF-8') as code:
             print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
             print("<movie>", file=code)
@@ -279,7 +301,7 @@ def print_files(...)
             print(" <num>" + number + "</num>", file=code)
             print(" <premiered>" + release + "</premiered>", file=code)
             print(" <cover>" + cover + "</cover>", file=code)
-            if config.Config().is_trailer():
+            if config.getInstance().is_trailer():
                 print(" <trailer>" + trailer + "</trailer>", file=code)
             print(" <website>" + website + "</website>", file=code)
             print("</movie>", file=code)
@@ -287,12 +309,12 @@ def print_files(...)
     except IOError as e:
         print("[-]Write Failed!")
         print("[-]", e)
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
     except Exception as e1:
         print("[-]Write Failed!")
         print("[-]", e1)
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
@@ -321,7 +343,7 @@ def cutImage(imagecut, path, number, leak_word, c_word):
# leak (leaked): value 1 or 0
# uncensored: value 1 or 0
# ======================================================================== add watermark
-def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config):
+def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored):
     mark_type = ''
     if cn_sub:
         mark_type += ',字幕'
@@ -331,17 +353,17 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored):
         mark_type += ',无码'
     if mark_type == '':
         return
-    add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf)
+    add_mark_thread(thumb_path, cn_sub, leak, uncensored)
     print('[+]Thumb Add Mark: ' + mark_type.strip(','))
-    add_mark_thread(poster_path, cn_sub, leak, uncensored, conf)
+    add_mark_thread(poster_path, cn_sub, leak, uncensored)
     print('[+]Poster Add Mark: ' + mark_type.strip(','))

-def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf):
+def add_mark_thread(pic_path, cn_sub, leak, uncensored):
     size = 14
     img_pic = Image.open(pic_path)
     # read the configured corner; the modulo below, together with pos, walks the corners clockwise
     # top-left 0, top-right 1, bottom-right 2, bottom-left 3
-    count = conf.watermark_type()
+    count = config.getInstance().watermark_type()
     if cn_sub == 1 or cn_sub == '1':
         add_to_pic(pic_path, img_pic, size, count, 1)  # add
         count = (count + 1) % 4
@@ -391,29 +413,38 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
     img_pic.save(pic_path, quality=95)
# ======================== end =================================
-def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config):  # file path, number, suffix, destination
+def paste_file_to_folder(filepath, path, number, leak_word, c_word):  # file path, number, suffix, destination
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
     file_parent_origin_path = str(filepath_obj.parent)
     try:
         targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}")
+        # never overwrite, under any circumstances: if a data-source or engine bug handed every file the
+        # same number, renaming them onto each other one by one would lose everything beyond recovery
+        if os.path.exists(targetpath):
+            raise FileExistsError('File Exists on destination path, we will never overwriting.')
+        soft_link = config.getInstance().soft_link()
         # if soft_link=1, use a symlink
-        if conf.soft_link() == 0:
+        if soft_link == 0:
             shutil.move(filepath, targetpath)
-        elif conf.soft_link() == 1:
-            # use a relative path so the video still opens when accessed over the network
+        elif soft_link == 1:
+            # first try a relative path, so the video still opens when accessed over the network; if that
+            # fails (e.g. across drive letters, where relative paths are unsupported), retry the symlink
+            # with an absolute path
+            try:
                 filerelpath = os.path.relpath(filepath, path)
                 os.symlink(filerelpath, targetpath)
-        elif conf.soft_link() == 2:
+            except:
+                os.symlink(filepath_obj.resolve(), targetpath)
+        elif soft_link == 2:
             shutil.move(filepath, targetpath)
             # after the move, leave a traceable symlink at the old location pointing at the new one,
             # so a file can be tracked down if a wrong number led to a bad rename and move, and
-            # recovered by hand. The symlink's extension was changed so it would not be scraped again
+            # recovered by hand. Since symlinks are no longer scraped, the extension needs no change
             targetabspath = os.path.abspath(targetpath)
             if targetabspath != os.path.abspath(filepath):
                 targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
-                os.symlink(targetrelpath, filepath + '#sym')
+                os.symlink(targetrelpath, filepath)
-        sub_res = conf.sub_rule()
+        sub_res = config.getInstance().sub_rule()
         for subname in sub_res:
             sub_filepath = str(filepath_obj.with_suffix(subname))
@@ -422,9 +453,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word):
                 print('[+]Sub moved!')
         return True
-    except FileExistsError:
-        print('[-]File Exists! Please check your movie!')
-        print('[-]move to the root folder of the program.')
+    except FileExistsError as fee:
+        print(f'[-]FileExistsError: {fee}')
+        moveFailedFolder(filepath)
         return
     except PermissionError:
         print('[-]Error! Please run as administrator!')
@@ -434,19 +465,22 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word):
         return
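
Editor's note: the relative-then-absolute fallback exists because os.path.relpath cannot bridge two Windows drive letters; on such input it raises instead of returning a usable path. A quick illustration (Windows-only behavior):

    import os
    # On Windows, relpath across drives raises ValueError -- the case the
    # try/except above falls back from by linking to the absolute path instead:
    try:
        os.path.relpath('D:/movies/a.mp4', 'E:/JAV_output')
    except ValueError as e:
        print(e)  # path is on mount 'D:', start on mount 'E:'
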
-def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf):  # file path, number, suffix, destination
+def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word):  # file path, number, suffix, destination
     if multi_part == 1:
         number += part  # number gets the CD1 suffix appended here
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
     file_parent_origin_path = str(filepath_obj.parent)
+    targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")
+    if os.path.exists(targetpath):
+        raise FileExistsError('File Exists on destination path, we will never overwriting.')
     try:
-        if conf.soft_link():
-            os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+        if config.getInstance().soft_link():
+            os.symlink(filepath, targetpath)
         else:
-            shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+            shutil.move(filepath, targetpath)
-        sub_res = conf.sub_rule()
+        sub_res = config.getInstance().sub_rule()
         for subname in sub_res:
             sub_filepath = str(filepath_obj.with_suffix(subname))
             if os.path.isfile(sub_filepath):  # move subtitles
@@ -454,9 +488,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word):
                 print('[+]Sub moved!')
         print('[!]Success')
         return True
-    except FileExistsError:
-        print('[-]File Exists! Please check your movie!')
-        print('[-]move to the root folder of the program.')
+    except FileExistsError as fee:
+        print(f'[-]FileExistsError: {fee}')
         return
     except PermissionError:
         print('[-]Error! Please run as administrator!')
@@ -465,7 +498,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word):
         print(f'[-]OS Error errno {oserr.errno}')
         return

-def get_part(filepath, conf):
+def get_part(filepath):
     try:
         if re.search('-CD\d+', filepath):
             return re.findall('-CD\d+', filepath)[0]
@@ -473,7 +506,7 @@ def get_part(filepath):
             return re.findall('-cd\d+', filepath)[0]
     except:
         print("[-]failed!Please rename the filename again!")
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
@@ -493,7 +526,8 @@ def debug_print(data: json):
     pass

-def core_main(file_path, number_th, conf: config.Config):
+def core_main(file_path, number_th):
+    conf = config.getInstance()
     # ======================================================================= initialize needed variables
     multi_part = 0
     part = ''
@@ -507,11 +541,11 @@ def core_main(file_path, number_th):
     # the commented-out variable below is not needed
     #rootpath= os.getcwd
     number = number_th
-    json_data = get_data_from_json(number, conf)  # resolve the number
+    json_data = get_data_from_json(number)  # resolve the number
     # Return if blank dict returned (data not found)
     if not json_data:
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return

     if json_data["number"] != number:
@@ -526,16 +560,13 @@ def core_main(file_path, number_th):
     # ======================================================================= detect -C / -CD suffixes
     if '-CD' in filepath or '-cd' in filepath:
         multi_part = 1
-        part = get_part(filepath, conf)
+        part = get_part(filepath)
     if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
         cn_sub = '1'
         c_word = '-C'  # suffix for movies with Chinese subtitles

     # determine whether the movie is uncensored
-    if is_uncensored(number):
-        uncensored = 1
-    else:
-        uncensored = 0
+    uncensored = 1 if is_uncensored(number) else 0

     if '流出' in filepath or 'uncensored' in filepath:
@@ -550,7 +581,7 @@ def core_main(file_path, number_th):
     debug_print(json_data)

     # create the destination folder
-    #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf)
+    #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data)

     # main_mode
     # 1: Scraping mode
@@ -558,54 +589,55 @@ def core_main(file_path, number_th):
     # 3: scrape without changing any paths
     if conf.main_mode() == 1:
         # create the folder
-        path = create_folder(json_data, conf)
+        path = create_folder(json_data)
         if multi_part == 1:
             number += part  # number gets the CD1 suffix appended here

         # check the small cover; when imagecut is 3, download it
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)

         # create_folder returned the numbered path
-        image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+        image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath)

         if not multi_part or part.lower() == '-cd1':
             try:
                 # download the trailer
                 if conf.is_trailer() and json_data.get('trailer'):
-                    trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+                    trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
             except:
                 pass

             try:
-                # download stills: data, path, conf: config.Config, filepath
+                # download stills: data, path, filepath
                 if conf.is_extrafanart() and json_data.get('extrafanart'):
-                    extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+                    extrafanart_download(json_data.get('extrafanart'), path, filepath)
             except:
                 pass

         # crop the cover
         cutImage(imagecut, path, number, leak_word, c_word)

-        # write the files
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf)
-        # move the file
-        paste_file_to_folder(filepath, path, number, leak_word, c_word, conf)
+        # add watermarks
         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)

+        # move the movie
+        paste_file_to_folder(filepath, path, number, leak_word, c_word)
+
+        # write the .nfo metadata file last; its creation marks the task as successful
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored)

     elif conf.main_mode() == 2:
         # create the folder
-        path = create_folder(json_data, conf)
+        path = create_folder(json_data)
         # move the file
-        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf)
+        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word)

         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)

     elif conf.main_mode() == 3:
         path = str(Path(file_path).parent)
@@ -614,28 +646,29 @@ def core_main(file_path, number_th):
         # check the small cover; when imagecut is 3, download it
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)

         # create_folder returned the numbered path
-        image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+        image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath)

         if not multi_part or part.lower() == '-cd1':
             # download the trailer
             if conf.is_trailer() and json_data.get('trailer'):
-                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)

-            # download stills: data, path, conf: config.Config, filepath
+            # download stills: data, path, filepath
             if conf.is_extrafanart() and json_data.get('extrafanart'):
-                extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+                extrafanart_download(json_data.get('extrafanart'), path, filepath)

         # crop the cover
         cutImage(imagecut, path, number, leak_word, c_word)

-        # write the files
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
-                    tag, json_data.get('actor_list'), liuchu, uncensored, conf)
+        # add watermarks
         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)

+        # write the .nfo metadata file last; its creation marks the task as successful
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
+                    tag, json_data.get('actor_list'), liuchu, uncensored)

number_parser.py

@@ -1,14 +1,14 @@
 import os
 import re
-from core import *
+import sys
+import config

 G_spat = re.compile(
     "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
     re.IGNORECASE)

-def get_number(debug,filepath: str) -> str:
+def get_number(debug,file_path: str) -> str:
     # """
     # >>> from number_parser import get_number
     # >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
@@ -32,77 +32,174 @@ def get_number(debug,file_path: str) -> str:
     # >>> get_number("snis-829-C.mp4")
     # 'snis-829'
     # """
-    filepath = os.path.basename(filepath)
-    if debug == False:
+    filepath = os.path.basename(file_path)
+    # the debug==True and debug==False blocks were merged: this module only does string work
+    # with no I/O, so in debug mode printing the exception message is all that is needed
     try:
-        if '-' in filepath or '_' in filepath:  # regular extraction, mainly for numbers containing '-' or '_'
-            #filepath = filepath.replace("_", "-")
+        file_number = get_number_by_dict(filepath)
+        if file_number:
+            return file_number
+        elif '-' in filepath or '_' in filepath:  # regular extraction, mainly for numbers containing '-' or '_'
             filepath = G_spat.sub("", filepath)
             filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # strip dates from the filename
             lower_check = filename.lower()
             if 'fc2' in lower_check:
                 filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-            file_number = get_number_by_dict(lower_check)
-            if file_number:
-                return file_number
             return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
         else:  # extract numbers without '-' (FANZA CID)
             # western-release matching rule
             oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
             if oumei:
                 return oumei.group()
             try:
                 return str(
                     re.findall(r'(.+?)\.',
                                str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
                     "['']").replace('_', '-')
             except:
-                return re.search(r'(.+?)\.', filepath)[0]
+                return str(re.search(r'(.+?)\.', filepath)[0])
     except Exception as e:
-        print('[-]' + str(e))
-        return
-    elif debug == True:
-        if '-' in filepath or '_' in filepath:  # regular extraction, mainly for numbers containing '-' or '_'
-            #filepath = filepath.replace("_", "-")
-            filepath = G_spat.sub("", filepath)
-            filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # strip dates from the filename
-            lower_check = filename.lower()
-            if 'fc2' in lower_check:
-                filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-            file_number = get_number_by_dict(lower_check)
-            if file_number:
-                return file_number
-            return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
-        else:  # extract numbers without '-' (FANZA CID)
-            # western-release matching rule
-            oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
-            if oumei:
-                return oumei.group()
-            try:
-                return str(
-                    re.findall(r'(.+?)\.',
-                               str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
-                    "['']").replace('_', '-')
-            except:
-                return re.search(r'(.+?)\.', filepath)[0]
-
-G_TAKE_NUM_RULES = {
-    'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()),
-    'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'),
-    '1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'),
-    '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'),
-    'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group())
-}
-
-def get_number_by_dict(lower_filename: str) -> str:
-    for k,v in G_TAKE_NUM_RULES.items():
-        if k in lower_filename:
-            return v(lower_filename)
+        if debug:
+            print(f'[-]Number Parser exception: {e} [{file_path}]')
     return None
# if __name__ == "__main__":
# 按javdb数据源的命名规范提取number
G_TAKE_NUM_RULES = {
'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
'1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
'10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0])
}
def get_number_by_dict(filename: str) -> str:
try:
for k,v in G_TAKE_NUM_RULES.items():
if re.search(k, filename, re.I):
return v(filename)
except:
pass
return None
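
Editor's note: each dict key is itself a regex over studio markers, so one rule can cover several labels ('1pon|mura|paco'). Quick usage against the rules above:

    # Case-insensitive matching (re.I), so mixed-case filenames work too:
    print(get_number_by_dict('Carib-020317_001.mp4'))      # 020317-001
    print(get_number_by_dict('pacopacomama-093021_539'))   # 093021_539
    print(get_number_by_dict('XXX-AV 22061.iso'))          # xxx-av-22061
    print(get_number_by_dict('heydouga-4102-023.mp4'))     # heydouga-4102-023
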
+class Cache_uncensored_conf:
+    prefix = None
+
+    def is_empty(self):
+        return bool(self.prefix is None)
+
+    def set(self, v: list):
+        if not v or not len(v) or not len(v[0]):
+            raise ValueError('input prefix list empty or None')
+        s = v[0]
+        if len(v) > 1:
+            for i in v[1:]:
+                s += f"|{i}.+"
+        self.prefix = re.compile(s, re.I)
+
+    def check(self, number):
+        if self.prefix is None:
+            raise ValueError('No init re compile')
+        return self.prefix.match(number)
+
+G_cache_uncensored_conf = Cache_uncensored_conf()
+
+# ======================================================================== uncensored check
+def is_uncensored(number):
+    if re.match(
+        r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
+        number,
+        re.I
+    ):
+        return True
+    if G_cache_uncensored_conf.is_empty():
+        G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
+    return G_cache_uncensored_conf.check(number)
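
Editor's note: set() joins the configured uncensored_prefix entries into a single alternation compiled once, so is_uncensored() only pays the regex-build cost on first use. A usage sketch of the class above:

    # How the cached prefix matcher behaves once built from uncensored_prefix:
    c = Cache_uncensored_conf()
    c.set('S2M,BT,LAF,SMD'.split(','))   # compiles r'S2M|BT.+|LAF.+|SMD.+' with re.I
    print(bool(c.check('SMD-115')))      # True  -- anchored at the start via match()
    print(bool(c.check('ABP-123')))      # False
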
if __name__ == "__main__":
# import doctest # import doctest
# doctest.testmod(raise_on_error=True) # doctest.testmod(raise_on_error=True)
test_use_cases = (
"Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取
"TokyoHot-n1287-HD SP2006 .mp4",
"caribean-020317_001.nfo", # -号误命名为_号的
"257138_3xplanet_1Pondo_080521_001.mp4",
"ADV-R0624-CD3.wmv", # 多碟影片
"XXX-AV 22061-CD5.iso", # 新支持片商格式 xxx-av-22061 命名规则来自javdb数据源
"xxx-av 20589.mp4",
"Muramura-102114_145-HD.wmv", # 新支持片商格式 102114_145 命名规则来自javdb数据源
"heydouga-4102-023-CD2.iso", # 新支持片商格式 heydouga-4102-023 命名规则来自javdb数据源
"HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048 命名规则来自javdb数据源
"pacopacomama-093021_539-FHD.mkv" # 新支持片商格式 093021_539 命名规则来自javdb数据源
)
def evprint(evstr):
code = compile(evstr, "<string>", "eval")
print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
for t in test_use_cases:
evprint(f'get_number(True, "{t}")')
if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
sys.exit(0)
+    # Use Everything's ES command-line tool to collect video filenames from whole disks as number-parser
+    # test data. The argument is a drive letter A..Z or a path that includes the drive letter.
+    # https://www.voidtools.com/support/everything/command_line_interface/
+    # The ES CLI needs the Everything search engine running, and the single es.exe binary must be on PATH.
+    # Everything is free software.
+    # Examples:
+    # python.exe .\number_parser.py ALL              # search all disks for videos
+    # python.exe .\number_parser.py D                # search drive D
+    # python.exe .\number_parser.py D:               # same as above
+    # python.exe .\number_parser.py D:\download\JAVs # search \download\JAVs on drive D (path must include the drive letter)
+    # ==================
+    # Linux/WSL1|2: use mlocate (Ubuntu/Debian) or plocate (Debian sid) to collect filenames as test data.
+    # Install with 'sudo apt install mlocate' (or plocate) and run 'sudo updatedb' once to build the index.
+    # macOS: use glocate from findutils; install with 'sudo brew install findutils' and run 'sudo gupdatedb' once to build the index.
+    # Example:
+    # python3 ./number_parser.py ALL
+    import subprocess
+    ES_search_path = "ALL disks"
+    if sys.argv[1] == "ALL":
+        if sys.platform == "win32":
+            # ES_prog_path = 'C:/greensoft/es/es.exe'
+            ES_prog_path = 'es.exe'  # es.exe must live in a directory on PATH
+            ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
+            out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+            out_text = out_bytes.decode('gb18030')  # Chinese Windows 10 x64 outputs GB18030 by default; it maps bijectively onto Unicode, so nothing is lost in transcoding
+            out_list = out_text.splitlines()
+        elif sys.platform in ("linux", "darwin"):
+            ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
+            ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
+            out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+            out_text = out_bytes.decode('utf-8')
+            out_list = [os.path.basename(line) for line in out_text.splitlines()]
+        else:
+            print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.')
+            sys.exit(1)
+    else:  # Windows, single disk
+        if sys.platform != "win32":
+            print('[!]Usage: python3 ./number_parser.py ALL')
+            sys.exit(0)
+        # ES_prog_path = 'C:/greensoft/es/es.exe'
+        ES_prog_path = 'es.exe'  # es.exe must live in a directory on PATH
+        if os.path.isdir(sys.argv[1]):
+            ES_search_path = sys.argv[1]
+        else:
+            ES_search_path = sys.argv[1][0] + ':/'
+            if not os.path.isdir(ES_search_path):
+                ES_search_path = 'C:/'
+        ES_search_path = os.path.normcase(ES_search_path)
+        ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
+        out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+        out_text = out_bytes.decode('gb18030')  # Chinese Windows 10 x64 outputs GB18030 by default; it maps bijectively onto Unicode, so nothing is lost in transcoding
+        out_list = out_text.splitlines()
+    print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
+    print(f'[+]Find {len(out_list)} Movies.')
+    for filename in out_list:
+        try:
+            n = get_number(True, filename)
+            if n:
+                print(' [{0}] {2}# {1}'.format(n, filename, '#uncensored ' if is_uncensored(n) else ''))
+            else:
+                print(f'[-]Number return None. # {filename}')
+        except Exception as e:
+            print(f'[-]Number Parser exception: {e} [{filename}]')
+    sys.exit(0)


@@ -10,7 +10,8 @@ pyinstaller --onefile AV_Data_Capture.py `
     --hidden-import ADC_function.py `
    --hidden-import core.py `
    --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
-    --add-data "Img;Img"
+    --add-data "Img;Img" `
+    --add-data "config.ini;." `

 rmdir -Recurse -Force build
 rmdir -Recurse -Force __pycache__


@@ -1,4 +1,8 @@
 pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448
 pip install pyquery pyinstaller
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "Img:Img" \
+    --add-data "config.ini:." \
 cp config.ini ./dist


@@ -12,5 +12,9 @@
 #fi
 pip3 install -r requirements.txt
 pip3 install cloudscraper==1.2.52
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "Img:Img" \
+    --add-data "config.ini:." \
 cp config.ini ./dist
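
Editor's note: across these build scripts the only per-platform difference in the --add-data flags is the separator between source and destination: ';' on Windows and ':' elsewhere, which matches Python's os.pathsep. A tiny sketch for scripts that must emit the right form on either platform:

    import sys
    # os.pathsep is ';' on Windows and ':' on POSIX -- the same split PyInstaller expects.
    sep = ';' if sys.platform == 'win32' else ':'
    print(f'--add-data "config.ini{sep}."')
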