Commit by unknown on 2021-11-10 13:40:11 +08:00
8 changed files with 14791 additions and 157 deletions

ADC_function.py Executable file → Normal file

@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor

 def getXpathSingle(htmlcode, xpath):
@@ -136,9 +137,9 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
     return None

-def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, use_scraper: bool = False):
     configProxy = config.getInstance().proxy()
-    s = requests.Session()
+    s = create_scraper(browser={'custom': ua or G_USER_AGENT,}) if use_scraper else requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
     retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
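The new use_scraper switch lets this MechanicalSoup-backed helper ride on a cloudscraper session, which the getStoryline_airavwiki rewrite further down relies on. A minimal sketch of a call site (the URL is a placeholder; return_type='browser' yields the (response, browser) pair used elsewhere in this commit):

    # Fetch a Cloudflare-protected page through the cloudscraper-backed browser.
    result, browser = get_html_by_browser(
        'https://example.com/?search=ABC-123',  # placeholder URL
        return_type='browser',
        use_scraper=True)
    if result is not None and result.ok:
        print(browser.page.title)  # browser.page is the parsed BeautifulSoup document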
@@ -238,9 +239,9 @@ def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, re
         result.encoding = encoding or "utf-8"
         return result.text
     except requests.exceptions.ProxyError:
-        print("[-]get_html_session() Proxy error! Please check your Proxy")
+        print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
     except Exception as e:
-        print(f"[-]get_html_session() failed. {e}")
+        print(f"[-]get_html_by_scraper() failed. {e}")
     return None
@@ -298,27 +299,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t
         translate_list = [i["trans"] for i in result.json()["sentences"]]
         trans_result = trans_result.join(translate_list)
-    # elif engine == "baidu":
-    #     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-    #     salt = secrets.randbelow(1435660287) + 1  # random.randint(1, 1435660288)
-    #     sign = app_id + src + str(salt) + key
-    #     sign = hashlib.md5(sign.encode()).hexdigest()
-    #     url += (
-    #         "?appid="
-    #         + app_id
-    #         + "&q="
-    #         + src
-    #         + "&from=auto&to="
-    #         + target_language
-    #         + "&salt="
-    #         + str(salt)
-    #         + "&sign="
-    #         + sign
-    #     )
-    #     result = get_html(url=url, return_type="object")
-    #
-    #     translate_list = [i["dst"] for i in result.json()["trans_result"]]
-    #     trans_result = trans_result.join(translate_list)
     elif engine == "azure":
         url = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&to=" + target_language
         headers = {
@@ -490,9 +470,40 @@ def download_file_with_filename(url, filename, path):
         raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
         return

+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+'''Usage example: download two different files on two threads and save them to
+different paths. The target directories may not exist yet, but write permission
+to the directories and files is required.
+    parallel_download_files([
+        ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+        ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+    ])
+'''
+# dn_list can be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), ...)
+# parallel: thread count of the download pool; 0 lets the function decide
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1,200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
+
 def delete_all_elements_in_list(string,lists):
     new_lists = []
     for i in lists:
         if i != string:
             new_lists.append(i)
     return new_lists
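Because pool.map preserves input order, parallel_download_files returns one entry per valid request: str(save_path) on success, None when the download came back empty. A minimal sketch of pairing results back with the requests (URLs and paths are placeholders):

    files = [
        ('https://site1/img/p1.jpg', '/tmp/img/p1.jpg'),
        ('https://site2/cover/n1.xml', '/tmp/cover/n1.xml'),
    ]
    for (url, _), saved in zip(files, parallel_download_files(files)):
        print(f'[+] saved {url} -> {saved}' if saved else f'[-] failed {url}')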


@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)

-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")

     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():

MappingTable/c_number.json Normal file

File diff suppressed because it is too large.


@@ -3,6 +3,9 @@
 keyword holds the comma-delimited terms matched against tag/director/series/studio/publisher values; every name needs a comma on both sides. When a scraped keyword appears in the list, the word for the configured language is output.
 zh_cn/zh_tw/jp are the output words for each language. An output word of "删除" (delete) means the keyword is removed from the corresponding field. -->
 <info>
+<!-- ======================== -->
+<!-- 删除 (delete) -->
+<!-- ======================== -->
 <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",成人奖,"/>
 <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",觸摸打字,触摸打字,"/>
 <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",10枚組,"/>
@@ -46,6 +49,9 @@ zh_cn/zh_tw/jp are the output words for each language.
 <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",亞洲,亚洲,"/>
 <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",ハロウィーンキャンペーン,"/>
+<!-- ======================== -->
+<!-- 标签 (tags) -->
+<!-- ======================== -->
 <a zh_cn="16小时+" zh_tw="16小時+" jp="16時間以上作品" keyword=",16小時以上作品,16小时以上作品,16時間以上作品,16小时+,16小時+,"/>
 <a zh_cn="3D" zh_tw="3D" jp="3D" keyword=",3D,"/>
 <a zh_cn="3D卡通" zh_tw="3D卡通" jp="3Dエロアニメ" keyword=",3D卡通,3Dエロアニメ,"/>
@@ -377,12 +383,16 @@ zh_cn/zh_tw/jp are the output words for each language.
 <a zh_cn="两男一女" zh_tw="兩男一女" jp="兩男一女" keyword=",兩男一女,两男一女,"/>
 <a zh_cn="3P" zh_tw="3P" jp="3P" keyword=",3P,3p,P,p,"/>
 <a zh_cn="唾液敷面" zh_tw="唾液敷面" jp="唾液敷面" keyword=",唾液敷面,"/>
+<!-- ======================== -->
+<!-- 厂商 (studios) -->
+<!-- ======================== -->
 <a zh_cn="kira☆kira" zh_tw="kira☆kira" jp="kira☆kira" keyword=",kira☆kira,"/>
-<a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
+<a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワン ナンバーワンスタイル,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
 <a zh_cn="一本道" zh_tw="一本道" jp="一本道" keyword=",一本道,"/>
 <a zh_cn="加勒比" zh_tw="加勒比" jp="加勒比" keyword=",加勒比,カリビアンコム,"/>
 <a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
-<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,"/>
+<a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,サディスティックヴィレッジ,"/>
 <a zh_cn="PRESTIGE" zh_tw="PRESTIGE" jp="PRESTIGE" keyword=",PRESTIGE,プレステージ,"/>
 <a zh_cn="MOODYZ" zh_tw="MOODYZ" jp="MOODYZ" keyword=",MOODYZ,ムーディーズ,"/>
 <a zh_cn="ROCKET" zh_tw="ROCKET" jp="ROCKET" keyword=",ROCKET,"/>
@@ -407,5 +417,28 @@ zh_cn/zh_tw/jp are the output words for each language.
 <a zh_cn="WANZ" zh_tw="WANZ" jp="WANZ" keyword=",WANZ,ワンズファクトリー,"/>
 <a zh_cn="BeFree" zh_tw="BeFree" jp="BeFree" keyword=",BeFree,"/>
 <a zh_cn="MAX-A" zh_tw="MAX-A" jp="MAX-A" keyword=",MAX-A,マックスエー,"/>
+<!-- 2021-11-8 Update -->
+<a zh_cn="Energy" zh_tw="Energy" jp="アイエナジー" keyword=",アイエナジー,"/>
+<a zh_cn="Idea Pocket" zh_tw="Idea Pocket" jp="アイデアポケット" keyword=",アイデアポケット,"/>
+<a zh_cn="AKNR" zh_tw="AKNR" jp="アキノリ" keyword=",アキノリ,"/>
+<a zh_cn="Attackers" zh_tw="Attackers" jp="アタッカーズ" keyword=",アタッカーズ,"/>
+<a zh_cn="Alice Japan" zh_tw="Alice Japan" jp="アリスJAPAN" keyword=",アリスJAPAN,"/>
+<a zh_cn="Aurora Project Annex" zh_tw="Aurora Project Annex" jp="オーロラプロジェクト・アネックス" keyword=",オーロラプロジェクト・アネックス,"/>
+<a zh_cn="Crystal 映像" zh_tw="Crystal 映像" jp="クリスタル映像" keyword=",クリスタル映像,"/>
+<a zh_cn="Glory Quest" zh_tw="Glory Quest" jp="グローリークエスト" keyword=",グローリークエスト,"/>
+<a zh_cn="DAS" zh_tw="DAS" jp="ダスッ!" keyword=",ダスッ!,"/>
+<a zh_cn="DEEPs" zh_tw="DEEPs" jp="ディープス" keyword=",ディープス,"/>
+<a zh_cn="Dogma" zh_tw="Dogma" jp="ドグマ" keyword=",ドグマ,"/>
+<a zh_cn="宇宙企画" zh_tw="宇宙企画" jp="メディアステーション" keyword=",メディアステーション,"/>
+<a zh_cn="WANZ FACTORY" zh_tw="WANZ FACTORY" jp="ワンズファクトリー" keyword=",ワンズファクトリー,"/>
+<a zh_cn="VR PRODUCE" zh_tw="VR PRODUCE" jp="VRプロダクツ" keyword=",VRプロダクツ,VRPRODUCE,"/>
+<a zh_cn="Real Works" zh_tw="Real Works" jp="レアルワークス" keyword=",レアルワークス,"/>
+<a zh_cn="MAX-A" zh_tw="MAX-A" jp="マックスエー" keyword=",マックスエー,"/>
+<a zh_cn="PETERS MAX" zh_tw="PETERS MAX" jp="ピーターズMAX" keyword=",ピーターズMAX,"/>
+<a zh_cn="NATURAL HIGH" zh_tw="NATURAL HIGH" jp="ナチュラルハイ" keyword=",ナチュラルハイ,"/>
+<a zh_cn="MAXING" zh_tw="MAXING" jp="マキシング" keyword=",マキシング,"/>
+<a zh_cn="Ms Video Group" zh_tw="Ms Video Group" jp="エムズビデオグループ" keyword=",エムズビデオグループ,"/>
+<a zh_cn="Minimum" zh_tw="Minimum" jp="ミニマム" keyword=",ミニマム,"/>
+<a zh_cn="WAAP Entertainment" zh_tw="WAAP Entertainment" jp="ワープエンタテインメント" keyword=",ワープエンタテインメント,"/>
+<a zh_cn="pacopacomama" zh_tw="pacopacomama" jp="パコパコママ" keyword=",pacopacomama,パコパコママ,"/>
 </info>


@@ -214,45 +214,6 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
     cover_small = tmpArr[0].strip('\"').strip('\'')
     # ==================== handle forbidden characters END ================== #\/:*?"<>|

-    # === replace katakana studio names
-    studio = studio.replace('アイエナジー','Energy')
-    studio = studio.replace('アイデアポケット','Idea Pocket')
-    studio = studio.replace('アキノリ','AKNR')
-    studio = studio.replace('アタッカーズ','Attackers')
-    studio = re.sub('アパッチ.*','Apache',studio)
-    studio = studio.replace('アマチュアインディーズ','SOD')
-    studio = studio.replace('アリスJAPAN','Alice Japan')
-    studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex')
-    studio = studio.replace('クリスタル映像','Crystal 映像')
-    studio = studio.replace('グローリークエスト','Glory Quest')
-    studio = studio.replace('ダスッ!','DAS')
-    studio = studio.replace('ディープス','DEEPs')
-    studio = studio.replace('ドグマ','Dogma')
-    studio = studio.replace('プレステージ','PRESTIGE')
-    studio = studio.replace('ムーディーズ','MOODYZ')
-    studio = studio.replace('メディアステーション','宇宙企画')
-    studio = studio.replace('ワンズファクトリー','WANZ FACTORY')
-    studio = studio.replace('エスワン ナンバーワンスタイル','S1')
-    studio = studio.replace('エスワンナンバーワンスタイル','S1')
-    studio = studio.replace('SODクリエイト','SOD')
-    studio = studio.replace('サディスティックヴィレッジ','SOD')
-    studio = studio.replace('VRプロダクツ','VR PRODUCE')
-    studio = studio.replace('VRPRODUCE','VR PRODUCE')
-    studio = studio.replace('レアルワークス','Real Works')
-    studio = studio.replace('マックスエー','MAX-A')
-    studio = studio.replace('ピーターズMAX','PETERS MAX')
-    studio = studio.replace('プレミアム','PREMIUM')
-    studio = studio.replace('ナチュラルハイ','NATURAL HIGH')
-    studio = studio.replace('マキシング','MAXING')
-    studio = studio.replace('エムズビデオグループ','Ms Video Group')
-    studio = studio.replace('ミニマム','Minimum')
-    studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
-    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
-    studio = studio.replace('パコパコママ','pacopacomama')
-    studio = re.sub('.*/妄想族','妄想族',studio)
-    studio = studio.replace('/',' ')
-    # === replace katakana studio names END
     # return the processed json_data
     json_data['title'] = title
     json_data['original_title'] = title
@@ -275,16 +236,14 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
     for translate_value in translate_values:
         if json_data[translate_value] == "":
             continue
-        t = ""
-        # if conf.get_transalte_engine() == "baidu":
-        #     json_data[translate_value] = translate(
-        #         json_data[translate_value],
-        #         target_language="zh",
-        #         engine=conf.get_transalte_engine(),
-        #         app_id=conf.get_transalte_appId(),
-        #         key=conf.get_transalte_key(),
-        #         delay=conf.get_transalte_delay(),
-        #     )
+        if translate_value == "title":
+            title_dict = json.load(
+                open(str(Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'), 'r', encoding="utf-8"))
+            try:
+                json_data[translate_value] = title_dict[number]
+                continue
+            except:
+                pass
         if conf.get_transalte_engine() == "azure":
             t = translate(
                 json_data[translate_value],
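The new title branch short-circuits machine translation: c_number.json (the large file added under MappingTable/ in this commit) maps a video number to a pre-translated title, and the code only falls through to the configured engine when the number is absent. A minimal sketch of that lookup as a standalone helper (the helper name is hypothetical; the file location matches the code above). Note the committed code reopens the file for every title and leaves the handle to the garbage collector; loading the dict once, as below, avoids both:

    import json
    from pathlib import Path
    from typing import Optional

    def lookup_pretranslated_title(number: str) -> Optional[str]:
        """Return the pre-translated title for a number, or None when unmapped."""
        c_number = Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'
        try:
            with open(c_number, 'r', encoding='utf-8') as f:
                return json.load(f).get(number)
        except OSError:
            return None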
@@ -309,9 +268,9 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
         elif ccm == 2:
             json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=aa)[0] for aa in json_data['actor_list']]
             json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data['actor'])[0]
-        # elif ccm == 3:
-        #     json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
-        #     json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
+        elif ccm == 3:
+            json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
+            json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
     except:
         json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
         json_data['actor'] = oCC.convert(json_data['actor'])
@@ -323,16 +282,23 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
             elif ccm == 2:
                 json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=t)[0] for t in json_data[cc]]
                 json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
-            # elif ccm == 3:
-            #     json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
-            #     #json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+            elif ccm == 3:
+                json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
+                json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
         except:
             json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
-    # elif cc == "studio":
-    # elif cc == "series":
-    # elif cc == "label":
     else:
         try:
+            if ccm == 1:
+                json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn', name=json_data[cc])[0]
+                json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+            elif ccm == 2:
+                json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data[cc])[0]
+                json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+            elif ccm == 3:
+                json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data[cc])[0]
+                json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+        except IndexError:
             json_data[cc] = oCC.convert(json_data[cc])
         except:
             pass
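All of these branches rely on the same lxml lookup: each <a> entry of the mapping table stores its match terms comma-delimited in @keyword, and contains() does a substring test against that list (note the new ccm == 3 branches call ADC_function.delete_list_all_elements while the existing ones call delete_all_elements_in_list). A self-contained sketch of the lookup, with one entry from the mapping table above inlined so it runs standalone:

    from lxml import etree

    # One entry from mapping_info.xml, inlined as sample data.
    info_mapping_data = etree.fromstring(
        '<info>'
        '<a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>'
        '</info>')
    # contains() substring-matches the scraped tag against the keyword list.
    hit = info_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn', name='東京熱')
    print(hit[0] if hit else 'no mapping')  # -> 东京热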


@@ -53,25 +53,28 @@ def getStoryline(number, title, sites: list=None):
     assert run_mode in (0,1,2)
     with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
         results = pool.map(getStoryline_mp, mp_args)
+    sel = ''
     if not debug and conf.storyline_show() == 0:
         for value in results:
             if isinstance(value, str) and len(value):
-                return value
-        return ''
+                if not is_japanese(value):
+                    return value
+                if not len(sel):
+                    sel = value
+        return sel
     # The debug output below is written to the log; output produced inside the process pool is not, and only shows on stdout
     s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
-    first = True
+    sel_site = ''
     sel = ''
     for site, desc in zip(apply_sites, results):
         sl = len(desc) if isinstance(desc, str) else 0
-        if sl and first:
-            s += f'[选中{site}字数:{sl}]'
-            first = False
-            sel = desc
-        elif sl:
-            s += f'{site}字数:{sl}'
-        else:
-            s += f'{site}:空'
+        if not is_japanese(desc):
+            sel_site, sel = site, desc
+            break
+        if sl and not len(sel_site):
+            sel_site, sel = site, desc
+    for site, desc in zip(apply_sites, results):
+        sl = len(desc) if isinstance(desc, str) else 0
+        s += f'[选中{site}字数:{sl}]' if site == sel_site else f'{site}字数:{sl}' if sl else f'{site}:空'
     print(s)
     return sel
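Both paths now prefer the first storyline that is not Japanese text and fall back to the first non-empty result. is_japanese comes from ADC_function and is not part of this diff; a minimal sketch of the kind of check it performs, assuming kana detection is enough to flag Japanese:

    import re

    def is_japanese(raw) -> bool:
        """Rough check: any hiragana or katakana marks the text as Japanese."""
        return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF]', str(raw)))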
@@ -144,36 +147,36 @@ def getStoryline_airav(number, debug):

 def getStoryline_airavwiki(number, debug):
     try:
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
-        url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}'
-        result, session = get_html_session(url, return_type='session')
-        if not result:
-            raise ValueError(f"get_html_session('{url}','{number}') failed")
-        j = json.loads(result.content)
-        if int(j.get('count')) == 0:
-            raise ValueError("number not found")
+        url = f'https://cn.airav.wiki/?search={kwd}'
+        result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
+        if not result.ok:
+            raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
+        s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
         link = None
-        for r in j["result"]:
-            n = r['barcode']
-            if re.search(number, n, re.I):
-                link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW')
+        for a in s:
+            title = a.img['title']
+            list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
+            if kwd == number:  # numbers like PRED-164 and RED-164 must be distinguishable
+                if re.match(f'^{number}$', list_number, re.I):
+                    link = a
+                    break
+            elif re.search(number, list_number, re.I):
+                link = a
                 break
         if link is None:
             raise ValueError("number not found")
-        result = session.get(link)
-        if not result.ok or not re.search(number, result.url, re.I):
+        result = browser.follow_link(link)
+        if not result.ok or not re.search(number, browser.url, re.I):
             raise ValueError("detail page not found")
-        j = json.loads(result.content)
-        if int(j.get('count')) != 1:
-            raise ValueError("number not found")
-        detail_number = j["result"]['barcode']
+        title = browser.page.select('head > title')[0].text.strip()
+        detail_number = str(re.findall('\[(.*?)]', title)[0])
         if not re.search(number, detail_number, re.I):
             raise ValueError("detail page number not match, got ->[{detail_number}]")
-        desc = j["result"]['description']
+        desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
         return desc
     except Exception as e:
         if debug:
-            print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
+            print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
         pass
     return ''
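The rewrite stops hitting the retired JSON API and instead walks the airav.wiki search results with CSS selectors (browser.page is the BeautifulSoup document MechanicalSoup exposes). The list number is recovered from each thumbnail's title attribute; a self-contained trace of that extraction, with sample markup shaped like the selector above expects (structure assumed):

    import re
    from bs4 import BeautifulSoup

    html = '<a class="d-block"><img title="PRED-164 sample description"/></a>'
    a = BeautifulSoup(html, 'html.parser').select_one('a.d-block')
    title = a.img['title']
    # non-greedy prefix up to the first whitespace run = the list number
    list_number = re.findall(r'^(.*?)\s+', title, re.A)[0].strip()
    print(list_number)  # PRED-164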
@@ -196,7 +199,8 @@ def getStoryline_58avgo(number, debug):
     link = None
     for a in s:
         title = a.h3.text.strip()
-        if re.search(number, title, re.I):
+        list_number = title[title.rfind(' ')+1:].strip()
+        if re.search(number, list_number, re.I):
             link = a
             break
     if link is None:
@@ -227,9 +231,13 @@ def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
         titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
         if not descs or not len(descs):
             raise ValueError(f"number not found")
+        partial_num = bool(re.match(r'\d{6}[\-_]\d{2,3}', number))
         for title, desc in zip(titles, descs):
-            page_number = title[title.rfind(' '):].strip()
-            if re.search(number, page_number, re.I):
+            page_number = title[title.rfind(' ')+1:].strip()
+            if not partial_num:
+                if re.match(f'^{number}$', page_number, re.I):
+                    return desc.strip()
+            elif re.search(number, page_number, re.I):
                 return desc.strip()
         raise ValueError(f"page number ->[{page_number}] not match")
     except Exception as e:
@@ -254,7 +262,7 @@ def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc
         s = browser.page.select('div.type_movie > div > ul > li > div')
         for div in s:
             title = div.a.h3.text.strip()
-            page_number = title[title.rfind(' '):].strip()
+            page_number = title[title.rfind(' ')+1:].strip()
             if re.search(number, page_number, re.I):
                 return div['data-description'].strip()
         raise ValueError(f"page number ->[{page_number}] not match")

core.py Executable file → Normal file

@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor

 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')

-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
-    extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warrning: Parallel download thread too large may cause website ban IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')

+# Check whether the cover downloaded successfully; otherwise move the file to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
@@ -299,7 +290,11 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
         with open(nfo_path, "wt", encoding='UTF-8') as code:
             print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
             print("<movie>", file=code)
             print(" <title>" + naming_rule + "</title>", file=code)
+            print(" <originaltitle>" + naming_rule + "</originaltitle>", file=code)
+            print(" <sorttitle>" + naming_rule + "</sorttitle>", file=code)
+            print(" <customrating>JP-18+</customrating>", file=code)
+            print(" <mpaa>JP-18+</mpaa>", file=code)
             print(" <set>", file=code)
             print(" </set>", file=code)
             print(" <studio>" + studio + "</studio>", file=code)
@@ -314,7 +309,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
         try:
             for key in actor_list:
                 print(" <actor>", file=code)
                 print(" <name>" + key + "</name>", file=code)
                 print(" </actor>", file=code)
         except:
             aaaa = ''
@@ -346,6 +341,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
             aaaaaaaa = ''
         print(" <num>" + number + "</num>", file=code)
         print(" <premiered>" + release + "</premiered>", file=code)
+        print(" <releasedate>" + release + "</releasedate>", file=code)
+        print(" <release>" + release + "</release>", file=code)
         print(" <cover>" + cover + "</cover>", file=code)
         if config.getInstance().is_trailer():
             print(" <trailer>" + trailer + "</trailer>", file=code)
@@ -564,16 +561,19 @@ def get_part(filepath):

 def debug_print(data: json):
     try:
-        print("[+] ---Debug info---")
+        print("[+] ------- DEBUG INFO -------")
         for i, v in data.items():
             if i == 'outline':
-                print('[+] -', i, ' :', len(v), 'characters')
+                print('[+] -', "%-14s" % i, ':', len(v), 'characters')
                 continue
             if i == 'actor_photo' or i == 'year':
                 continue
-            print('[+] -', "%-11s" % i, ':', v)
-        print("[+] ---Debug info---")
+            if i == 'extrafanart':
+                print('[+] -', "%-14s" % i, ':', len(v), 'links')
+                continue
+            print('[+] -', "%-14s" % i, ':', v)
+        print("[+] ------- DEBUG INFO -------")
     except:
         pass


@@ -4,7 +4,7 @@ import sys
 import config

 G_spat = re.compile(
-    "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
+    "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
     re.IGNORECASE)
@@ -44,7 +44,10 @@ def get_number(debug,file_path: str) -> str:
         lower_check = filename.lower()
         if 'fc2' in lower_check:
             filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-        return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
+        filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)
+        file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
+        file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
+        return file_number.upper()
     else:  # extract numbers that have no '-'/'_' separator (FANZA CID)
         # western (EU/US) release number matching rule
         oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
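Combined with the widened G_spat above, the two new re.sub calls collapse suffixed releases to their bare number. A condensed trace of the new path over two of the new test cases (G_spat copied from the hunk above; the real get_number also lower-cases FC2 names and handles no-separator numbers):

    import re

    G_spat = re.compile(
        "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p"
        "|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
        re.IGNORECASE)

    for name in ('SSIS100-C.mp4', 'SDDE-625_uncensored_leak_C_cd1.mp4'):
        name = G_spat.sub("", name)                                       # strip quality/leak markers
        name = re.sub(r"(-|_)cd\d{1,2}", "", name, flags=re.IGNORECASE)   # drop multi-part suffix
        number = re.search(r'\w+(-|_)\w+', name, re.A).group()            # isolate the number
        number = re.sub(r"(-|_)c$", "", number, flags=re.IGNORECASE)      # drop Chinese-sub marker
        print(number.upper())  # -> SSIS100, SDDE-625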
@@ -119,6 +122,15 @@ if __name__ == "__main__":
     # import doctest
     # doctest.testmod(raise_on_error=True)
     test_use_cases = (
+        "MEYD-594-C.mp4",
+        "SSIS-001_C.mp4",
+        "SSIS100-C.mp4",
+        "SSIS101_C.mp4",
+        "ssni984.mp4",
+        "ssni666.mp4",
+        "SDDE-625_uncensored_C.mp4",
+        "SDDE-625_uncensored_leak_C.mp4",
+        "SDDE-625_uncensored_leak_C_cd1.mp4",
         "Tokyo Hot n9001 FHD.mp4",  # no '-' separator; previously could not be extracted correctly
         "TokyoHot-n1287-HD SP2006 .mp4",
         "caribean-020317_001.nfo",  # '-' mistyped as '_'