unknown
2021-11-10 13:40:11 +08:00
8 changed files with 14791 additions and 157 deletions

ADC_function.py (63 changed lines), Executable file → Normal file

@@ -15,6 +15,7 @@ import mechanicalsoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from cloudscraper import create_scraper
from concurrent.futures import ThreadPoolExecutor
def getXpathSingle(htmlcode, xpath):
@@ -136,9 +137,9 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
return None
def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, use_scraper: bool = False):
configProxy = config.getInstance().proxy()
s = requests.Session()
s = create_scraper(browser={'custom': ua or G_USER_AGENT,}) if use_scraper else requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
@@ -238,9 +239,9 @@ def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, re
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_session() Proxy error! Please check your Proxy")
print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_session() failed. {e}")
print(f"[-]get_html_by_scraper() failed. {e}")
return None
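A minimal sketch of the new use_scraper switch (URL illustrative): with use_scraper=True the underlying session is a cloudscraper instance instead of requests.Session, so Cloudflare-protected pages can be fetched through the same MechanicalSoup browser interface, as the rewritten getStoryline_airavwiki below now does.

from ADC_function import get_html_by_browser

# return_type='browser' also hands back the browser object for follow-up
# navigation; use_scraper=True swaps requests.Session for a cloudscraper.
result, browser = get_html_by_browser(
    'https://cn.airav.wiki/?search=SSIS-001',
    return_type='browser',
    use_scraper=True)
if result is not None and result.ok:
    print(browser.page.select('head > title')[0].text.strip())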
@@ -298,27 +299,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
translate_list = [i["trans"] for i in result.json()["sentences"]]
trans_result = trans_result.join(translate_list)
# elif engine == "baidu":
# url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
# salt = secrets.randbelow(1435660287) + 1 # random.randint(1, 1435660288)
# sign = app_id + src + str(salt) + key
# sign = hashlib.md5(sign.encode()).hexdigest()
# url += (
# "?appid="
# + app_id
# + "&q="
# + src
# + "&from=auto&to="
# + target_language
# + "&salt="
# + str(salt)
# + "&sign="
# + sign
# )
# result = get_html(url=url, return_type="object")
#
# translate_list = [i["dst"] for i in result.json()["trans_result"]]
# trans_result = trans_result.join(translate_list)
elif engine == "azure":
url = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&to=" + target_language
headers = {
@@ -490,9 +470,40 @@ def download_file_with_filename(url, filename, path):
raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
return
def download_one_file(args):
def _inner(url: str, save_path: Path):
filebytes = get_html(url, return_type='content')
if isinstance(filebytes, bytes) and len(filebytes):
if len(filebytes) == save_path.open('wb').write(filebytes):
return str(save_path)
return _inner(*args)
'''Usage example: download two different files in two threads and save them to
different paths. The target directories may not exist yet, but write permission
for the target directories and files is required.
parallel_download_files([
('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
])
'''
# dn_list can be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), ...)
# parallel: thread-pool size for parallel downloads; 0 lets the function decide
def parallel_download_files(dn_list, parallel: int = 0):
mp_args = []
for url, fullpath in dn_list:
if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
fullpath = Path(fullpath)
fullpath.parent.mkdir(parents=True, exist_ok=True)
mp_args.append((url, fullpath))
if not len(mp_args):
return []
if not isinstance(parallel, int) or parallel not in range(1,200):
parallel = min(5, len(mp_args))
with ThreadPoolExecutor(parallel) as pool:
results = list(pool.map(download_one_file, mp_args))
return results
def delete_all_elements_in_list(string,lists):
new_lists = []
for i in lists:
if i != string:
new_lists.append(i)
return new_lists
return new_lists
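Taken together, download_one_file and parallel_download_files give callers one-call concurrent downloads. A short sketch of the documented usage (URLs and paths illustrative):

import ADC_function

# Each entry is (url, save_path); missing directories are created, the pool
# size is picked automatically for parallel=0, and each result is the saved
# path on success or None on failure, in input order.
results = ADC_function.parallel_download_files([
    ('https://site1/img/p1.jpg', '/tmp/img/p1.jpg'),
    ('https://site2/cover/n1.xml', '/tmp/cover/n1.xml'),
])
for i, saved in enumerate(results, start=1):
    print(f'[+] {saved}' if saved else f'[-] file {i} failed')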

AV_Data_Capture.py

@@ -14,7 +14,7 @@ import config
from datetime import datetime, timedelta
import time
from pathlib import Path
from ADC_function import file_modification_days, get_html
from ADC_function import file_modification_days, get_html, parallel_download_files
from number_parser import get_number
from core import core_main, moveFailedFolder
@@ -473,18 +473,24 @@ def main():
if conf.update_check():
check_update(version)
# Download Mapping Table
if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
ADC_function.download_file_with_filename(
# Download Mapping Table, parallel version
down_map_tab = []
actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
if not actor_xml.exists():
down_map_tab.append((
"https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
"mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
print("[+] [1/2] Mapping Table Downloaded")
if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
ADC_function.download_file_with_filename(
actor_xml))
info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
if not info_xml.exists():
down_map_tab.append((
"https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
"mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
print("[+] [2/2] Mapping Table Downloaded")
info_xml))
res = parallel_download_files(down_map_tab)
for i, fp in enumerate(res, start=1):
if fp and len(fp):
print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
else:
print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
print(f"[+]Load Config file '{conf.ini_path}'.")
if conf.debug():

MappingTable/c_number.json (new file, 14598 additions)

File diff suppressed because it is too large.

MappingTable/mapping_info.xml

@@ -3,6 +3,9 @@
keyword is used to match tags/directors/series/studios/publishers; every name must be delimited by commas on both sides. When a scraped keyword appears in the list, the term for the corresponding language is output.
zh_cn/zh_tw/jp give the output term for each language, following the configured language. When the output term is "删除" (delete), the keyword is removed from the corresponding content-->
<info>
<!-- ======================== -->
<!-- 删除 -->
<!-- ======================== -->
<a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",成人奖,"/>
<a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",觸摸打字,触摸打字,"/>
<a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",10枚組,"/>
@@ -46,6 +49,9 @@ zh_cn/zh_tw/jp give the output term for each language, following the configured language.
<a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",亞洲,亚洲,"/>
<a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",ハロウィーンキャンペーン,"/>
<!-- ======================== -->
<!-- 标签 -->
<!-- ======================== -->
<a zh_cn="16小时+" zh_tw="16小時+" jp="16時間以上作品" keyword=",16小時以上作品,16小时以上作品,16時間以上作品,16小时+,16小時+,"/>
<a zh_cn="3D" zh_tw="3D" jp="3D" keyword=",3D,"/>
<a zh_cn="3D卡通" zh_tw="3D卡通" jp="3Dエロアニメ" keyword=",3D卡通,3Dエロアニメ,"/>
@@ -377,12 +383,16 @@ zh_cn/zh_tw/jp give the output term for each language, following the configured language.
<a zh_cn="两男一女" zh_tw="兩男一女" jp="兩男一女" keyword=",兩男一女,两男一女,"/>
<a zh_cn="3P" zh_tw="3P" jp="3P" keyword=",3P,3p,P,p,"/>
<a zh_cn="唾液敷面" zh_tw="唾液敷面" jp="唾液敷面" keyword=",唾液敷面,"/>
<!-- ======================== -->
<!-- 厂商 -->
<!-- ======================== -->
<a zh_cn="kira☆kira" zh_tw="kira☆kira" jp="kira☆kira" keyword=",kira☆kira,"/>
<a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
<a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワン ナンバーワンスタイル,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
<a zh_cn="一本道" zh_tw="一本道" jp="一本道" keyword=",一本道,"/>
<a zh_cn="加勒比" zh_tw="加勒比" jp="加勒比" keyword=",加勒比,カリビアンコム,"/>
<a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,"/>
<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,サディスティックヴィレッジ,"/>
<a zh_cn="PRESTIGE" zh_tw="PRESTIGE" jp="PRESTIGE" keyword=",PRESTIGE,プレステージ,"/>
<a zh_cn="MOODYZ" zh_tw="MOODYZ" jp="MOODYZ" keyword=",MOODYZ,ムーディーズ,"/>
<a zh_cn="ROCKET" zh_tw="ROCKET" jp="ROCKET" keyword=",ROCKET,"/>
@@ -407,5 +417,28 @@ zh_cn/zh_tw/jp give the output term for each language, following the configured language.
<a zh_cn="WANZ" zh_tw="WANZ" jp="WANZ" keyword=",WANZ,ワンズファクトリー,"/>
<a zh_cn="BeFree" zh_tw="BeFree" jp="BeFree" keyword=",BeFree,"/>
<a zh_cn="MAX-A" zh_tw="MAX-A" jp="MAX-A" keyword=",MAX-A,マックスエー,"/>
<!-- 2021-11-8 Update -->
<a zh_cn="Energy" zh_tw="Energy" jp="アイエナジー" keyword=",アイエナジー,"/>
<a zh_cn="Idea Pocket" zh_tw="Idea Pocket" jp="アイデアポケット" keyword=",アイデアポケット,"/>
<a zh_cn="AKNR" zh_tw="AKNR" jp="アキノリ" keyword=",アキノリ,"/>
<a zh_cn="Attackers" zh_tw="Attackers" jp="アタッカーズ" keyword=",アタッカーズ,"/>
<a zh_cn="Alice Japan" zh_tw="Alice Japan" jp="アリスJAPAN" keyword=",アリスJAPAN,"/>
<a zh_cn="Aurora Project Annex" zh_tw="Aurora Project Annex" jp="オーロラプロジェクト・アネックス" keyword=",オーロラプロジェクト・アネックス,"/>
<a zh_cn="Crystal 映像" zh_tw="Crystal 映像" jp="クリスタル映像" keyword=",クリスタル映像,"/>
<a zh_cn="Glory Quest" zh_tw="Glory Quest" jp="グローリークエスト" keyword=",グローリークエスト,"/>
<a zh_cn="DAS" zh_tw="DAS" jp="ダスッ!" keyword=",ダスッ!,"/>
<a zh_cn="DEEPs" zh_tw="DEEPs" jp="ディープス" keyword=",ディープス,"/>
<a zh_cn="Dogma" zh_tw="Dogma" jp="ドグマ" keyword=",ドグマ,"/>
<a zh_cn="宇宙企画" zh_tw="宇宙企画" jp="メディアステーション" keyword=",メディアステーション,"/>
<a zh_cn="WANZ FACTORY" zh_tw="WANZ FACTORY" jp="ワンズファクトリー" keyword=",ワンズファクトリー,"/>
<a zh_cn="VR PRODUCE" zh_tw="VR PRODUCE" jp="VRプロダクツ" keyword=",VRプロダクツ,VRPRODUCE,"/>
<a zh_cn="Real Works" zh_tw="Real Works" jp="レアルワークス" keyword=",レアルワークス,"/>
<a zh_cn="MAX-A" zh_tw="MAX-A" jp="マックスエー" keyword=",マックスエー,"/>
<a zh_cn="PETERS MAX" zh_tw="PETERS MAX" jp="ピーターズMAX" keyword=",ピーターズMAX,"/>
<a zh_cn="NATURAL HIGH" zh_tw="NATURAL HIGH" jp="ナチュラルハイ" keyword=",ナチュラルハイ,"/>
<a zh_cn="MAXING" zh_tw="MAXING" jp="マキシング" keyword=",マキシング,"/>
<a zh_cn="Ms Video Group" zh_tw="Ms Video Group" jp="エムズビデオグループ" keyword=",エムズビデオグループ,"/>
<a zh_cn="Minimum" zh_tw="Minimum" jp="ミニマム" keyword=",ミニマム,"/>
<a zh_cn="WAAP Entertainment" zh_tw="WAAP Entertainment" jp="ワープエンタテインメント" keyword=",ワープエンタテインメント,"/>
<a zh_cn="pacopacomama" zh_tw="pacopacomama" jp="パコパコママ" keyword=",pacopacomama,パコパコママ,"/>
</info>

WebCrawler/__init__.py

@@ -214,45 +214,6 @@ def get_data_from_json(file_number, oCC): # return metadata from JSON
cover_small = tmpArr[0].strip('\"').strip('\'')
# ==================== handle invalid characters END ================== #\/:*?"<>|
# === replace katakana studio names
studio = studio.replace('アイエナジー','Energy')
studio = studio.replace('アイデアポケット','Idea Pocket')
studio = studio.replace('アキノリ','AKNR')
studio = studio.replace('アタッカーズ','Attackers')
studio = re.sub('アパッチ.*','Apache',studio)
studio = studio.replace('アマチュアインディーズ','SOD')
studio = studio.replace('アリスJAPAN','Alice Japan')
studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex')
studio = studio.replace('クリスタル映像','Crystal 映像')
studio = studio.replace('グローリークエスト','Glory Quest')
studio = studio.replace('ダスッ!','DAS')
studio = studio.replace('ディープス','DEEPs')
studio = studio.replace('ドグマ','Dogma')
studio = studio.replace('プレステージ','PRESTIGE')
studio = studio.replace('ムーディーズ','MOODYZ')
studio = studio.replace('メディアステーション','宇宙企画')
studio = studio.replace('ワンズファクトリー','WANZ FACTORY')
studio = studio.replace('エスワン ナンバーワンスタイル','S1')
studio = studio.replace('エスワンナンバーワンスタイル','S1')
studio = studio.replace('SODクリエイト','SOD')
studio = studio.replace('サディスティックヴィレッジ','SOD')
studio = studio.replace('VRプロダクツ','VR PRODUCE')
studio = studio.replace('VRPRODUCE','VR PRODUCE')
studio = studio.replace('レアルワークス','Real Works')
studio = studio.replace('マックスエー','MAX-A')
studio = studio.replace('ピーターズMAX','PETERS MAX')
studio = studio.replace('プレミアム','PREMIUM')
studio = studio.replace('ナチュラルハイ','NATURAL HIGH')
studio = studio.replace('マキシング','MAXING')
studio = studio.replace('エムズビデオグループ','Ms Video Group')
studio = studio.replace('ミニマム','Minimum')
studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
studio = studio.replace('パコパコママ','pacopacomama')
studio = re.sub('.*/妄想族','妄想族',studio)
studio = studio.replace('/',' ')
# === replace katakana studio names END
# return the processed json_data
json_data['title'] = title
json_data['original_title'] = title
@@ -275,16 +236,14 @@ def get_data_from_json(file_number, oCC): # return metadata from JSON
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
t = ""
# if conf.get_transalte_engine() == "baidu":
# json_data[translate_value] = translate(
# json_data[translate_value],
# target_language="zh",
# engine=conf.get_transalte_engine(),
# app_id=conf.get_transalte_appId(),
# key=conf.get_transalte_key(),
# delay=conf.get_transalte_delay(),
# )
if translate_value == "title":
title_dict = json.load(
open(str(Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'), 'r', encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue
except:
pass
if conf.get_transalte_engine() == "azure":
t = translate(
json_data[translate_value],
@@ -309,9 +268,9 @@ def get_data_from_json(file_number, oCC): # return metadata from JSON
elif ccm == 2:
json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=aa)[0] for aa in json_data['actor_list']]
json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data['actor'])[0]
# elif ccm == 3:
# json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
# json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
elif ccm == 3:
json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
except:
json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
json_data['actor'] = oCC.convert(json_data['actor'])
@@ -323,16 +282,23 @@ def get_data_from_json(file_number, oCC): # return metadata from JSON
elif ccm == 2:
json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=t)[0] for t in json_data[cc]]
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
# elif ccm == 3:
# json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
# #json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
except:
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
# elif cc == "studio":
# elif cc == "series":
# elif cc == "label":
else:
try:
if ccm == 1:
json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn', name=json_data[cc])[0]
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data[cc])[0]
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data[cc])[0]
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
except IndexError:
json_data[cc] = oCC.convert(json_data[cc])
except:
pass
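The new title branch above consults MappingTable/c_number.json before calling any online translator. A sketch of that offline-first lookup, assuming the JSON maps a number to a pre-translated title:

import json
from pathlib import Path

def lookup_title(number: str, scraped_title: str) -> str:
    # Offline first: c_number.json is assumed to map numbers to translated
    # titles; fall back to the scraped title (for online translation) on miss.
    path = Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'
    try:
        with path.open('r', encoding='utf-8') as f:
            return json.load(f)[number]
    except (OSError, KeyError, json.JSONDecodeError):
        return scraped_title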

WebCrawler/storyline.py

@@ -53,25 +53,28 @@ def getStoryline(number, title, sites: list=None):
assert run_mode in (0,1,2)
with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
for value in results:
if isinstance(value, str) and len(value):
return value
return ''
if not is_japanese(value):
return value
if not len(sel):
sel = value
return sel
# The debug output below is written to the log; output produced inside the process pool is not, and only appears on stdout
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
first = True
sel = ''
sel_site = ''
for site, desc in zip(apply_sites, results):
sl = len(desc) if isinstance(desc, str) else 0
if sl and first:
s += f'[选中{site}字数:{sl}]'
first = False
sel = desc
elif sl:
s += f'{site}字数:{sl}'
else:
s += f'{site}:空'
if not is_japanese(desc):
sel_site, sel = site, desc
break
if sl and not len(sel_site):
sel_site, sel = site, desc
for site, desc in zip(apply_sites, results):
sl = len(desc) if isinstance(desc, str) else 0
s += f'[选中{site}字数:{sl}]' if site == sel_site else f'{site}字数:{sl}' if sl else f'{site}:空'
print(s)
return sel
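The selection rule replacing the old first-non-empty behavior reads, in isolation: return the first non-Japanese synopsis, otherwise fall back to the first non-empty one. A sketch (is_japanese is the helper this module already uses):

def choose_storyline(results) -> str:
    # Prefer the first non-Japanese synopsis; otherwise remember the first
    # non-empty one as the fallback, mirroring the loop above.
    fallback = ''
    for desc in results:
        if isinstance(desc, str) and len(desc):
            if not is_japanese(desc):
                return desc
            if not fallback:
                fallback = desc
    return fallback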
@@ -144,36 +147,36 @@ def getStoryline_airav(number, debug):
def getStoryline_airavwiki(number, debug):
try:
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}'
result, session = get_html_session(url, return_type='session')
if not result:
raise ValueError(f"get_html_session('{url}','{number}') failed")
j = json.loads(result.content)
if int(j.get('count')) == 0:
raise ValueError("number not found")
url = f'https://cn.airav.wiki/?search={kwd}'
result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
if not result.ok:
raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
link = None
for r in j["result"]:
n = r['barcode']
if re.search(number, n, re.I):
link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW')
for a in s:
title = a.img['title']
list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
if kwd == number: # numbers like PRED-164 and RED-164 must be distinguishable
if re.match(f'^{number}$', list_number, re.I):
link = a
break
elif re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = session.get(link)
if not result.ok or not re.search(number, result.url, re.I):
result = browser.follow_link(link)
if not result.ok or not re.search(number, browser.url, re.I):
raise ValueError("detail page not found")
j = json.loads(result.content)
if int(j.get('count')) != 1:
raise ValueError("number not found")
detail_number = j["result"]['barcode']
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError("detail page number not match, got ->[{detail_number}]")
desc = j["result"]['description']
desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
pass
return ''
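The matching policy in the rewritten scraper: when the search keyword is the full number, only an exact match is accepted, so RED-164 no longer hits PRED-164; only the shortened six-digit searches keep substring matching. A sketch:

import re

def number_matches(number: str, list_number: str) -> bool:
    # kwd is shortened only for numbers like 032620_001; in that case the
    # site is searched by the 6-digit prefix and substring matching is kept.
    kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
    if kwd == number:  # e.g. PRED-164 vs RED-164 must be distinguished
        return bool(re.match(f'^{number}$', list_number, re.I))
    return bool(re.search(number, list_number, re.I))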
@@ -196,7 +199,8 @@ def getStoryline_58avgo(number, debug):
link = None
for a in s:
title = a.h3.text.strip()
if re.search(number, title, re.I):
list_number = title[title.rfind(' ')+1:].strip()
if re.search(number, list_number, re.I):
link = a
break
if link is None:
@@ -227,9 +231,13 @@ def getStoryline_avno1(number, debug): # get storyline from avno1.cc
titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
if not descs or not len(descs):
raise ValueError(f"number not found")
partial_num = bool(re.match(r'\d{6}[\-_]\d{2,3}', number))
for title, desc in zip(titles, descs):
page_number = title[title.rfind(' '):].strip()
if re.search(number, page_number, re.I):
page_number = title[title.rfind(' ')+1:].strip()
if not partial_num:
if re.match(f'^{number}$', page_number, re.I):
return desc.strip()
elif re.search(number, page_number, re.I):
return desc.strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
@@ -254,7 +262,7 @@ def getStoryline_avno1OLD(number, debug): # get storyline from avno1.cc
s = browser.page.select('div.type_movie > div > ul > li > div')
for div in s:
title = div.a.h3.text.strip()
page_number = title[title.rfind(' '):].strip()
page_number = title[title.rfind(' ')+1:].strip()
if re.search(number, page_number, re.I):
return div['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] not match")

core.py (42 changed lines), Executable file → Normal file

@@ -9,7 +9,6 @@ from PIL import Image
from io import BytesIO
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from ADC_function import *
from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
if conf.debug():
print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
def download_one_file(args):
def _inner(url: str, save_path: Path):
filebytes = get_html(url, return_type='content')
if isinstance(filebytes, bytes) and len(filebytes):
if len(filebytes) == save_path.open('wb').write(filebytes):
return str(save_path)
return _inner(*args)
def extrafanart_download_threadpool(url_list, save_dir, number):
tm_start = time.perf_counter()
conf = config.getInstance()
extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
download_only_missing_images = conf.download_only_missing_images()
mp_args = []
dn_list = []
for i, url in enumerate(url_list, start=1):
jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
continue
mp_args.append((url, jpg_fullpath))
if not len(mp_args):
dn_list.append((url, jpg_fullpath))
if not len(dn_list):
return
extrafanart_dir.mkdir(parents=True, exist_ok=True)
parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
if parallel > 100:
print('[!]Warning: too many parallel download threads may cause the website to ban your IP!')
with ThreadPoolExecutor(parallel) as pool:
result = list(pool.map(download_one_file, mp_args))
result = parallel_download_files(dn_list, parallel)
failed = 0
for i, r in enumerate(result, start=1):
if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
if conf.debug():
print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
# whether the cover downloaded successfully; otherwise move it to the failed folder
def image_download(cover, number, leak_word, c_word, path, filepath):
filename = f"{number}{leak_word}{c_word}-fanart.jpg"
@@ -299,7 +290,11 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
with open(nfo_path, "wt", encoding='UTF-8') as code:
print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
print("<movie>", file=code)
print(" <title>" + naming_rule + "</title>", file=code)
print(" <title>" + naming_rule + "</title>", file=code)
print(" <originaltitle>" + naming_rule + "</originaltitle>", file=code)
print(" <sorttitle>" + naming_rule + "</sorttitle>", file=code)
print(" <customrating>JP-18+</customrating>", file=code)
print(" <mpaa>JP-18+</mpaa>", file=code)
print(" <set>", file=code)
print(" </set>", file=code)
print(" <studio>" + studio + "</studio>", file=code)
@@ -314,7 +309,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
try:
for key in actor_list:
print(" <actor>", file=code)
print(" <name>" + key + "</name>", file=code)
print(" <name>" + key + "</name>", file=code)
print(" </actor>", file=code)
except:
aaaa = ''
@@ -346,6 +341,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
aaaaaaaa = ''
print(" <num>" + number + "</num>", file=code)
print(" <premiered>" + release + "</premiered>", file=code)
print(" <releasedate>" + release + "</releasedate>", file=code)
print(" <release>" + release + "</release>", file=code)
print(" <cover>" + cover + "</cover>", file=code)
if config.getInstance().is_trailer():
print(" <trailer>" + trailer + "</trailer>", file=code)
@@ -564,16 +561,19 @@ def get_part(filepath):
def debug_print(data: json):
try:
print("[+] ---Debug info---")
print("[+] ------- DEBUG INFO -------")
for i, v in data.items():
if i == 'outline':
print('[+] -', i, ' :', len(v), 'characters')
print('[+] -', "%-14s" % i, ':', len(v), 'characters')
continue
if i == 'actor_photo' or i == 'year':
continue
print('[+] -', "%-11s" % i, ':', v)
if i == 'extrafanart':
print('[+] -', "%-14s" % i, ':', len(v), 'links')
continue
print('[+] -', "%-14s" % i, ':', v)
print("[+] ---Debug info---")
print("[+] ------- DEBUG INFO -------")
except:
pass

number_parser.py

@@ -4,7 +4,7 @@ import sys
import config
G_spat = re.compile(
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
re.IGNORECASE)
@@ -44,7 +44,10 @@ def get_number(debug,file_path: str) -> str:
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)
file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
return file_number.upper()
else: # extract numbers without a hyphen (FANZA CID)
# matching rules for Western (Europe/US) numbers
oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
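The combined effect of the widened G_spat and the new -cdN/-C stripping can be condensed into one pipeline (a sketch; the real get_number() has more branches, and the new test filenames below exercise exactly these cases):

import re

G_spat = re.compile(
    r"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd"
    r"|-1080p|_1080p|-720p|_720p|^hhd800\.com@"
    r"|-uncensored|_uncensored|-leak|_leak",
    re.IGNORECASE)

def clean_number(stem: str) -> str:
    # Strip quality/leak markers, drop a -cdN part suffix, pull out the
    # number, drop a trailing -C/_C subtitle marker, and normalize case.
    s = G_spat.sub('', stem)
    s = re.sub(r'(-|_)cd\d{1,2}', '', s, flags=re.IGNORECASE)
    number = re.search(r'\w+(-|_)\w+', s, re.A).group()
    return re.sub(r'(-|_)c$', '', number, flags=re.IGNORECASE).upper()

print(clean_number('SDDE-625_uncensored_leak_C_cd1'))  # -> SDDE-625
print(clean_number('MEYD-594-C'))                      # -> MEYD-594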
@@ -119,6 +122,15 @@ if __name__ == "__main__":
# import doctest
# doctest.testmod(raise_on_error=True)
test_use_cases = (
"MEYD-594-C.mp4",
"SSIS-001_C.mp4",
"SSIS100-C.mp4",
"SSIS101_C.mp4",
"ssni984.mp4",
"ssni666.mp4",
"SDDE-625_uncensored_C.mp4",
"SDDE-625_uncensored_leak_C.mp4",
"SDDE-625_uncensored_leak_C_cd1.mp4",
"Tokyo Hot n9001 FHD.mp4", # 无-号,以前无法正确提取
"TokyoHot-n1287-HD SP2006 .mp4",
"caribean-020317_001.nfo", # -号误命名为_号的