Merge branch 'master' of https://github.com/yoshiko2/av_data_capture
ADC_function.py (Executable file → Normal file, 63 lines changed)
@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor


 def getXpathSingle(htmlcode, xpath):
@@ -136,9 +137,9 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
     return None


-def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, use_scraper: bool = False):
     configProxy = config.getInstance().proxy()
-    s = requests.Session()
+    s = create_scraper(browser={'custom': ua or G_USER_AGENT,}) if use_scraper else requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
     retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
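The new use_scraper switch routes the MechanicalSoup session through cloudscraper, which is what the rewritten getStoryline_airavwiki hunk further down relies on. A minimal sketch of a call site (the search URL value and the shortened selector are illustrative, not from this commit):

    result, browser = get_html_by_browser(
        'https://cn.airav.wiki/?search=SSIS-001',  # illustrative URL
        return_type='browser',
        use_scraper=True)                          # cloudscraper-backed session
    if result.ok:
        links = browser.page.select('a.d-block')   # BeautifulSoup page, as used below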
@@ -238,9 +239,9 @@ def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, re
         result.encoding = encoding or "utf-8"
         return result.text
     except requests.exceptions.ProxyError:
-        print("[-]get_html_session() Proxy error! Please check your Proxy")
+        print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
     except Exception as e:
-        print(f"[-]get_html_session() failed. {e}")
+        print(f"[-]get_html_by_scraper() failed. {e}")
     return None

@@ -298,27 +299,6 @@ f"https://{gsite}/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={t

         translate_list = [i["trans"] for i in result.json()["sentences"]]
         trans_result = trans_result.join(translate_list)
-    # elif engine == "baidu":
-    #     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-    #     salt = secrets.randbelow(1435660287) + 1  # random.randint(1, 1435660288)
-    #     sign = app_id + src + str(salt) + key
-    #     sign = hashlib.md5(sign.encode()).hexdigest()
-    #     url += (
-    #         "?appid="
-    #         + app_id
-    #         + "&q="
-    #         + src
-    #         + "&from=auto&to="
-    #         + target_language
-    #         + "&salt="
-    #         + str(salt)
-    #         + "&sign="
-    #         + sign
-    #     )
-    #     result = get_html(url=url, return_type="object")
-    #
-    #     translate_list = [i["dst"] for i in result.json()["trans_result"]]
-    #     trans_result = trans_result.join(translate_list)
     elif engine == "azure":
         url = "https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&to=" + target_language
         headers = {
@@ -490,9 +470,40 @@ def download_file_with_filename(url, filename, path):
         raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
         return

+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+'''用法示例: 2线程同时下载两个不同文件,并保存到不同路径,路径目录可未创建,但需要具备对目标目录和文件的写权限
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+    ])
+'''
+# dn_list 可以是 tuple或者list: ((url1, save_fullpath1),(url2, save_fullpath2),)
+# parallel: 并行下载的线程池线程数,为0则由函数自己决定
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1,200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
+
 def delete_all_elements_in_list(string,lists):
     new_lists = []
     for i in lists:
         if i != string:
             new_lists.append(i)
-            return new_lists
+    return new_lists

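The Chinese docstring above reads: "usage example: two threads download two different files to different paths; the directories may not exist yet, but write permission to the target directory and file is required." A minimal usage sketch along those lines (URLs and paths are illustrative); each result is the saved path on success or None on failure. The last two lines of the hunk also appear to move return new_lists out of the loop, so the helper now filters the whole list instead of returning early:

    from ADC_function import parallel_download_files

    results = parallel_download_files([
        ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
        ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml'),
    ], parallel=2)
    for fp in results:
        print(fp if fp else 'download failed')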
@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder

@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)

-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")

     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
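When both mapping files are missing, the new loop would print per-file results along these lines (illustrative output derived from the print statements above; the home path is an assumption):

    [+] [1/2] Mapping Table Downloaded to /home/user/.local/share/avdc/mapping_actor.xml
    [+] [2/2] Mapping Table Downloaded to /home/user/.local/share/avdc/mapping_info.xml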
MappingTable/c_number.json (Normal file, 14598 lines): file diff suppressed because it is too large.
@@ -3,6 +3,9 @@
 keyword:用于匹配标签/导演/系列/制作/发行的关键词,每个名字前后都需要用逗号隔开。当其中包含刮削得到的关键词时,可以输出对应语言的词。
 zh_cn/zh_tw/jp:指对应语言输出的词,按设置的对应语言输出。当输出词为“删除”时表示:遇到该关键词时,在对应内容中删除该关键词-->
 <info>
+    <!-- ======================== -->
+    <!-- 删除 -->
+    <!-- ======================== -->
     <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",成人奖,"/>
     <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",觸摸打字,触摸打字,"/>
     <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",10枚組,"/>
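The header comment explains (in Chinese) that every keyword is fenced by commas so a scraped name can be matched inside the list, and that an output word of 删除 ("delete") marks the keyword for removal from the corresponding field. A minimal lxml lookup sketch of how WebCrawler consumes these rows (the file path is illustrative; the $name variable binding appears verbatim in the WebCrawler hunks below):

    from lxml import etree

    info = etree.parse('mapping_info.xml').getroot()   # illustrative path
    zh_cn = info.xpath('a[contains(@keyword, $name)]/@zh_cn', name='東京熱')[0]
    print(zh_cn)   # -> 东京热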
@@ -46,6 +49,9 @@ zh_cn/zh_tw/jp:指对应语言输出的词,按设置的对应语言输出。
     <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",亞洲,亚洲,"/>
     <a zh_cn="删除" zh_tw="删除" jp="删除" keyword=",ハロウィーンキャンペーン,"/>

+    <!-- ======================== -->
+    <!-- 标签 -->
+    <!-- ======================== -->
     <a zh_cn="16小时+" zh_tw="16小時+" jp="16時間以上作品" keyword=",16小時以上作品,16小时以上作品,16時間以上作品,16小时+,16小時+,"/>
     <a zh_cn="3D" zh_tw="3D" jp="3D" keyword=",3D,"/>
     <a zh_cn="3D卡通" zh_tw="3D卡通" jp="3Dエロアニメ" keyword=",3D卡通,3Dエロアニメ,"/>
@@ -377,12 +383,16 @@ zh_cn/zh_tw/jp:指对应语言输出的词,按设置的对应语言输出。
     <a zh_cn="两男一女" zh_tw="兩男一女" jp="兩男一女" keyword=",兩男一女,两男一女,"/>
     <a zh_cn="3P" zh_tw="3P" jp="3P" keyword=",3P,3p,3P,3p,"/>
+    <a zh_cn="唾液敷面" zh_tw="唾液敷面" jp="唾液敷面" keyword=",唾液敷面,"/>

+    <!-- ======================== -->
+    <!-- 厂商 -->
+    <!-- ======================== -->
     <a zh_cn="kira☆kira" zh_tw="kira☆kira" jp="kira☆kira" keyword=",kira☆kira,"/>
-    <a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
+    <a zh_cn="S1 NO.1 STYLE" zh_tw="S1 NO.1 STYLE" jp="S1 NO.1 STYLE" keyword=",S1 Style,エスワン,エスワン ナンバーワンスタイル,エスワンナンバーワンスタイル,S1 NO.1 STYLE,S1NO.1STYLE,"/>
     <a zh_cn="一本道" zh_tw="一本道" jp="一本道" keyword=",一本道,"/>
     <a zh_cn="加勒比" zh_tw="加勒比" jp="加勒比" keyword=",加勒比,カリビアンコム,"/>
     <a zh_cn="东京热" zh_tw="東京熱" jp="TOKYO-HOT" keyword=",东京热,東京熱,東熱,TOKYO-HOT,"/>
-    <a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,"/>
+    <a zh_cn="SOD" zh_tw="SOD" jp="SOD" keyword=",SOD,SODクリエイト,サディスティックヴィレッジ,"/>
     <a zh_cn="PRESTIGE" zh_tw="PRESTIGE" jp="PRESTIGE" keyword=",PRESTIGE,プレステージ,"/>
     <a zh_cn="MOODYZ" zh_tw="MOODYZ" jp="MOODYZ" keyword=",MOODYZ,ムーディーズ,"/>
     <a zh_cn="ROCKET" zh_tw="ROCKET" jp="ROCKET" keyword=",ROCKET,"/>
@@ -407,5 +417,28 @@ zh_cn/zh_tw/jp:指对应语言输出的词,按设置的对应语言输出。
     <a zh_cn="WANZ" zh_tw="WANZ" jp="WANZ" keyword=",WANZ,ワンズファクトリー,"/>
     <a zh_cn="BeFree" zh_tw="BeFree" jp="BeFree" keyword=",BeFree,"/>
     <a zh_cn="MAX-A" zh_tw="MAX-A" jp="MAX-A" keyword=",MAX-A,マックスエー,"/>
+
+    <!-- 2021-11-8 Update -->
+    <a zh_cn="Energy" zh_tw="Energy" jp="アイエナジー" keyword=",アイエナジー,"/>
+    <a zh_cn="Idea Pocket" zh_tw="Idea Pocket" jp="アイデアポケット" keyword=",アイデアポケット,"/>
+    <a zh_cn="AKNR" zh_tw="AKNR" jp="アキノリ" keyword=",アキノリ,"/>
+    <a zh_cn="Attackers" zh_tw="Attackers" jp="アタッカーズ" keyword=",アタッカーズ,"/>
+    <a zh_cn="Alice Japan" zh_tw="Alice Japan" jp="アリスJAPAN" keyword=",アリスJAPAN,"/>
+    <a zh_cn="Aurora Project Annex" zh_tw="Aurora Project Annex" jp="オーロラプロジェクト・アネックス" keyword=",オーロラプロジェクト・アネックス,"/>
+    <a zh_cn="Crystal 映像" zh_tw="Crystal 映像" jp="クリスタル映像" keyword=",クリスタル映像,"/>
+    <a zh_cn="Glory Quest" zh_tw="Glory Quest" jp="グローリークエスト" keyword=",グローリークエスト,"/>
+    <a zh_cn="DAS!" zh_tw="DAS!" jp="ダスッ!" keyword=",ダスッ!,"/>
+    <a zh_cn="DEEP’s" zh_tw="DEEP’s" jp="ディープス" keyword=",ディープス,"/>
+    <a zh_cn="Dogma" zh_tw="Dogma" jp="ドグマ" keyword=",ドグマ,"/>
+    <a zh_cn="宇宙企画" zh_tw="宇宙企画" jp="メディアステーション" keyword=",メディアステーション,"/>
+    <a zh_cn="WANZ FACTORY" zh_tw="WANZ FACTORY" jp="ワンズファクトリー" keyword=",ワンズファクトリー,"/>
+    <a zh_cn="V&R PRODUCE" zh_tw="V&R PRODUCE" jp="V&Rプロダクツ" keyword=",V&Rプロダクツ,V&RPRODUCE,"/>
+    <a zh_cn="Real Works" zh_tw="Real Works" jp="レアルワークス" keyword=",レアルワークス,"/>
+    <a zh_cn="MAX-A" zh_tw="MAX-A" jp="マックスエー" keyword=",マックスエー,"/>
+    <a zh_cn="PETERS MAX" zh_tw="PETERS MAX" jp="ピーターズMAX" keyword=",ピーターズMAX,"/>
+    <a zh_cn="NATURAL HIGH" zh_tw="NATURAL HIGH" jp="ナチュラルハイ" keyword=",ナチュラルハイ,"/>
+    <a zh_cn="MAXING" zh_tw="MAXING" jp="マキシング" keyword=",マキシング,"/>
+    <a zh_cn="M’s Video Group" zh_tw="M’s Video Group" jp="エムズビデオグループ" keyword=",エムズビデオグループ,"/>
+    <a zh_cn="Minimum" zh_tw="Minimum" jp="ミニマム" keyword=",ミニマム,"/>
+    <a zh_cn="WAAP Entertainment" zh_tw="WAAP Entertainment" jp="ワープエンタテインメント" keyword=",ワープエンタテインメント,"/>
+    <a zh_cn="pacopacomama" zh_tw="pacopacomama" jp="パコパコママ" keyword=",pacopacomama,パコパコママ,"/>
 </info>
@@ -214,45 +214,6 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
         cover_small = tmpArr[0].strip('\"').strip('\'')
     # ====================处理异常字符 END================== #\/:*?"<>|

-    # === 替换Studio片假名
-    studio = studio.replace('アイエナジー','Energy')
-    studio = studio.replace('アイデアポケット','Idea Pocket')
-    studio = studio.replace('アキノリ','AKNR')
-    studio = studio.replace('アタッカーズ','Attackers')
-    studio = re.sub('アパッチ.*','Apache',studio)
-    studio = studio.replace('アマチュアインディーズ','SOD')
-    studio = studio.replace('アリスJAPAN','Alice Japan')
-    studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex')
-    studio = studio.replace('クリスタル映像','Crystal 映像')
-    studio = studio.replace('グローリークエスト','Glory Quest')
-    studio = studio.replace('ダスッ!','DAS!')
-    studio = studio.replace('ディープス','DEEP’s')
-    studio = studio.replace('ドグマ','Dogma')
-    studio = studio.replace('プレステージ','PRESTIGE')
-    studio = studio.replace('ムーディーズ','MOODYZ')
-    studio = studio.replace('メディアステーション','宇宙企画')
-    studio = studio.replace('ワンズファクトリー','WANZ FACTORY')
-    studio = studio.replace('エスワン ナンバーワンスタイル','S1')
-    studio = studio.replace('エスワンナンバーワンスタイル','S1')
-    studio = studio.replace('SODクリエイト','SOD')
-    studio = studio.replace('サディスティックヴィレッジ','SOD')
-    studio = studio.replace('V&Rプロダクツ','V&R PRODUCE')
-    studio = studio.replace('V&RPRODUCE','V&R PRODUCE')
-    studio = studio.replace('レアルワークス','Real Works')
-    studio = studio.replace('マックスエー','MAX-A')
-    studio = studio.replace('ピーターズMAX','PETERS MAX')
-    studio = studio.replace('プレミアム','PREMIUM')
-    studio = studio.replace('ナチュラルハイ','NATURAL HIGH')
-    studio = studio.replace('マキシング','MAXING')
-    studio = studio.replace('エムズビデオグループ','M’s Video Group')
-    studio = studio.replace('ミニマム','Minimum')
-    studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
-    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
-    studio = studio.replace('パコパコママ','pacopacomama')
-    studio = re.sub('.*/妄想族','妄想族',studio)
-    studio = studio.replace('/',' ')
-    # === 替换Studio片假名 END
-
     # 返回处理后的json_data
     json_data['title'] = title
     json_data['original_title'] = title
@@ -275,16 +236,14 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
     for translate_value in translate_values:
         if json_data[translate_value] == "":
             continue
-        t = ""
-        # if conf.get_transalte_engine() == "baidu":
-        #     json_data[translate_value] = translate(
-        #         json_data[translate_value],
-        #         target_language="zh",
-        #         engine=conf.get_transalte_engine(),
-        #         app_id=conf.get_transalte_appId(),
-        #         key=conf.get_transalte_key(),
-        #         delay=conf.get_transalte_delay(),
-        #     )
+        if translate_value == "title":
+            title_dict = json.load(
+                open(str(Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'), 'r', encoding="utf-8"))
+            try:
+                json_data[translate_value] = title_dict[number]
+                continue
+            except:
+                pass
         if conf.get_transalte_engine() == "azure":
             t = translate(
                 json_data[translate_value],
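The new branch short-circuits title translation with an offline lookup in the bundled c_number.json (the 14598-line file whose diff is suppressed above); only numbers missing from the table fall through to the online engine. A sketch of the lookup on its own, assuming the file maps a number to a pre-translated title:

    import json
    from pathlib import Path

    title_dict = json.load(open(str(Path.home() / '.local' / 'share' / 'avdc' / 'c_number.json'),
                                'r', encoding="utf-8"))
    title = title_dict.get('SSIS-001')   # illustrative number; None falls through to translation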
@@ -309,9 +268,9 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
             elif ccm == 2:
                 json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=aa)[0] for aa in json_data['actor_list']]
                 json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data['actor'])[0]
-            # elif ccm == 3:
-            #     json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
-            #     json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
+            elif ccm == 3:
+                json_data['actor_list'] = [actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=aa)[0] for aa in json_data['actor_list']]
+                json_data['actor'] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data['actor'])[0]
         except:
             json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
             json_data['actor'] = oCC.convert(json_data['actor'])
@@ -323,16 +282,23 @@ def get_data_from_json(file_number, oCC):  # 从JSON返回元数据
                 elif ccm == 2:
                     json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=t)[0] for t in json_data[cc]]
                     json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
-                # elif ccm == 3:
-                #     json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
-                #     #json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                elif ccm == 3:
+                    json_data[cc] = [info_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=t)[0] for t in json_data[cc]]
+                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
             except:
                 json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
+        # elif cc == "studio":
+        # elif cc == "series":
+        # elif cc == "label":
         else:
             try:
                 if ccm == 1:
                     json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn', name=json_data[cc])[0]
+                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
                 elif ccm == 2:
                     json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@zh_tw', name=json_data[cc])[0]
+                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
                 elif ccm == 3:
                     json_data[cc] = actor_mapping_data.xpath('a[contains(@keyword, $name)]/@jp', name=json_data[cc])[0]
+                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
             except IndexError:
                 json_data[cc] = oCC.convert(json_data[cc])
             except:
                 pass

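Rows mapped to 删除 ("delete") act as removal sentinels: after the xpath pass, any list entry equal to "删除" is filtered out. A one-line sketch using the list filter defined in the ADC_function.py hunk above:

    delete_all_elements_in_list("删除", ['3P', '删除', '东京热'])   # -> ['3P', '东京热']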
@@ -53,25 +53,28 @@ def getStoryline(number, title, sites: list=None):
     assert run_mode in (0,1,2)
     with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
         results = pool.map(getStoryline_mp, mp_args)
+    sel = ''
     if not debug and conf.storyline_show() == 0:
         for value in results:
             if isinstance(value, str) and len(value):
-                return value
-        return ''
+                if not is_japanese(value):
+                    return value
+                if not len(sel):
+                    sel = value
+        return sel
     # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示
     s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
-    first = True
-    sel = ''
+    sel_site = ''
     for site, desc in zip(apply_sites, results):
         sl = len(desc) if isinstance(desc, str) else 0
-        if sl and first:
-            s += f',[选中{site}字数:{sl}]'
-            first = False
-            sel = desc
-        elif sl:
-            s += f',{site}字数:{sl}'
-        else:
-            s += f',{site}:空'
+        if not is_japanese(desc):
+            sel_site, sel = site, desc
+            break
+        if sl and not len(sel_site):
+            sel_site, sel = site, desc
+    for site, desc in zip(apply_sites, results):
+        sl = len(desc) if isinstance(desc, str) else 0
+        s += f',[选中{site}字数:{sl}]' if site == sel_site else f',{site}字数:{sl}' if sl else f',{site}:空'
     print(s)
     return sel

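Instead of taking the first non-empty result, the selection now prefers the first description that is no longer Japanese (i.e. already translated/localized) and only falls back to the first non-empty one. The rule isolated as a sketch, assuming is_japanese() flags untranslated text as it does in the hunk above:

    def pick_storyline(results):
        sel = ''
        for value in results:
            if isinstance(value, str) and len(value):
                if not is_japanese(value):
                    return value      # first non-Japanese synopsis wins
                if not len(sel):
                    sel = value       # else remember the first non-empty one
        return sel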
@@ -144,36 +147,36 @@ def getStoryline_airav(number, debug):
 def getStoryline_airavwiki(number, debug):
     try:
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
-        url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}'
-        result, session = get_html_session(url, return_type='session')
-        if not result:
-            raise ValueError(f"get_html_session('{url}','{number}') failed")
-        j = json.loads(result.content)
-        if int(j.get('count')) == 0:
-            raise ValueError("number not found")
+        url = f'https://cn.airav.wiki/?search={kwd}'
+        result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
+        if not result.ok:
+            raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
+        s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
         link = None
-        for r in j["result"]:
-            n = r['barcode']
-            if re.search(number, n, re.I):
-                link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW')
+        for a in s:
+            title = a.img['title']
+            list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
+            if kwd == number:  # 番号PRED-164 和 RED-164需要能够区分
+                if re.match(f'^{number}$', list_number, re.I):
+                    link = a
+                    break
+            elif re.search(number, list_number, re.I):
+                link = a
                 break
         if link is None:
             raise ValueError("number not found")
-        result = session.get(link)
-        if not result.ok or not re.search(number, result.url, re.I):
+        result = browser.follow_link(link)
+        if not result.ok or not re.search(number, browser.url, re.I):
             raise ValueError("detail page not found")
-        j = json.loads(result.content)
-        if int(j.get('count')) != 1:
-            raise ValueError("number not found")
-        detail_number = j["result"]['barcode']
+        title = browser.page.select('head > title')[0].text.strip()
+        detail_number = str(re.findall('\[(.*?)]', title)[0])
         if not re.search(number, detail_number, re.I):
             raise ValueError("detail page number not match, got ->[{detail_number}]")
-        desc = j["result"]['description']
+        desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
         return desc

     except Exception as e:
         if debug:
-            print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
+            print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
         pass
     return ''

@@ -196,7 +199,8 @@ def getStoryline_58avgo(number, debug):
     link = None
     for a in s:
         title = a.h3.text.strip()
-        if re.search(number, title, re.I):
+        list_number = title[title.rfind(' ')+1:].strip()
+        if re.search(number, list_number, re.I):
             link = a
             break
     if link is None:
@@ -227,9 +231,13 @@ def getStoryline_avno1(number, debug):  # 获取剧情介绍 从avno1.cc取得
         titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
         if not descs or not len(descs):
             raise ValueError(f"number not found")
+        partial_num = bool(re.match(r'\d{6}[\-_]\d{2,3}', number))
         for title, desc in zip(titles, descs):
-            page_number = title[title.rfind(' '):].strip()
-            if re.search(number, page_number, re.I):
+            page_number = title[title.rfind(' ')+1:].strip()
+            if not partial_num:
+                if re.match(f'^{number}$', page_number, re.I):
+                    return desc.strip()
+            elif re.search(number, page_number, re.I):
                 return desc.strip()
         raise ValueError(f"page number ->[{page_number}] not match")
     except Exception as e:
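This hunk, like the 58avgo one above, stops matching the number against the whole listing title and compares it with the token after the last space, using an exact match when the search keyword was not truncated. The commit's own comment ("番号PRED-164 和 RED-164需要能够区分", i.e. PRED-164 and RED-164 must be distinguishable) motivates it; a small illustration with a made-up title:

    import re
    title = '美少女 PRED-164'                          # illustrative listing title
    re.search('RED-164', title, re.I)                  # matches: substring false hit
    page_number = title[title.rfind(' ')+1:].strip()   # -> 'PRED-164'
    re.match('^RED-164$', page_number, re.I)           # None: exact match rejects it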
@@ -254,7 +262,7 @@ def getStoryline_avno1OLD(number, debug):  # 获取剧情介绍 从avno1.cc取得
     s = browser.page.select('div.type_movie > div > ul > li > div')
     for div in s:
         title = div.a.h3.text.strip()
-        page_number = title[title.rfind(' '):].strip()
+        page_number = title[title.rfind(' ')+1:].strip()
         if re.search(number, page_number, re.I):
             return div['data-description'].strip()
     raise ValueError(f"page number ->[{page_number}] not match")

core.py (Executable file → Normal file, 42 lines changed)
@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor

 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')

-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
     extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warrning: Parallel download thread too large may cause website ban IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')

+
 # 封面是否下载成功,否则移动到failed
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
@@ -299,7 +290,11 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
     with open(nfo_path, "wt", encoding='UTF-8') as code:
         print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
         print("<movie>", file=code)
-        print("  <title>" + naming_rule + "</title>", file=code)
+        print("  <title>" + naming_rule + "</title>", file=code)
+        print("  <originaltitle>" + naming_rule + "</originaltitle>", file=code)
+        print("  <sorttitle>" + naming_rule + "</sorttitle>", file=code)
+        print("  <customrating>JP-18+</customrating>", file=code)
+        print("  <mpaa>JP-18+</mpaa>", file=code)
         print("  <set>", file=code)
         print("  </set>", file=code)
         print("  <studio>" + studio + "</studio>", file=code)
@@ -314,7 +309,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
         try:
             for key in actor_list:
                 print("  <actor>", file=code)
-                print("  <name>" + key + "</name>", file=code)
+                print("  <name>" + key + "</name>", file=code)
                 print("  </actor>", file=code)
         except:
             aaaa = ''
@@ -346,6 +341,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
             aaaaaaaa = ''
         print("  <num>" + number + "</num>", file=code)
         print("  <premiered>" + release + "</premiered>", file=code)
+        print("  <releasedate>" + release + "</releasedate>", file=code)
+        print("  <release>" + release + "</release>", file=code)
         print("  <cover>" + cover + "</cover>", file=code)
         if config.getInstance().is_trailer():
             print("  <trailer>" + trailer + "</trailer>", file=code)
@@ -564,16 +561,19 @@ def get_part(filepath):

 def debug_print(data: json):
     try:
-        print("[+] ---Debug info---")
+        print("[+] ------- DEBUG INFO -------")
         for i, v in data.items():
             if i == 'outline':
-                print('[+] -', i, ' :', len(v), 'characters')
+                print('[+] -', "%-14s" % i, ':', len(v), 'characters')
                 continue
             if i == 'actor_photo' or i == 'year':
                 continue
-            print('[+] -', "%-11s" % i, ':', v)
+            if i == 'extrafanart':
+                print('[+] -', "%-14s" % i, ':', len(v), 'links')
+                continue
+            print('[+] -', "%-14s" % i, ':', v)

-        print("[+] ---Debug info---")
+        print("[+] ------- DEBUG INFO -------")
     except:
         pass

@@ -4,7 +4,7 @@ import sys
 import config

 G_spat = re.compile(
-    "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
+    "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
     re.IGNORECASE)

@@ -44,7 +44,10 @@ def get_number(debug,file_path: str) -> str:
         lower_check = filename.lower()
         if 'fc2' in lower_check:
             filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-        return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
+        filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)
+        file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
+        file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)
+        return file_number.upper()
     else:  # 提取不含减号-的番号,FANZA CID
         # 欧美番号匹配规则
         oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
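Together with the extended G_spat above (which now also strips -uncensored/_uncensored/-leak/_leak), these lines let get_number digest the suffix-laden test cases added below. A worked trace, assuming G_spat is applied with .sub('') earlier in get_number, as in the rest of the function:

    import re
    G_spat = re.compile(
        "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@|-uncensored|_uncensored|-leak|_leak",
        re.IGNORECASE)

    filename = "SDDE-625_uncensored_leak_C_cd1"                               # stem of a new test case
    filename = G_spat.sub("", filename)                                       # -> 'SDDE-625_C_cd1'
    filename = re.sub("(-|_)cd\d{1,2}", "", filename, flags=re.IGNORECASE)    # -> 'SDDE-625_C'
    file_number = str(re.search(r'\w+(-|_)\w+', filename, re.A).group())      # -> 'SDDE-625_C'
    file_number = re.sub("(-|_)c$", "", file_number, flags=re.IGNORECASE)     # -> 'SDDE-625'
    print(file_number.upper())                                                # SDDE-625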
@@ -119,6 +122,15 @@ if __name__ == "__main__":
     # import doctest
     # doctest.testmod(raise_on_error=True)
     test_use_cases = (
         "MEYD-594-C.mp4",
         "SSIS-001_C.mp4",
         "SSIS100-C.mp4",
+        "SSIS101_C.mp4",
+        "ssni984.mp4",
+        "ssni666.mp4",
+        "SDDE-625_uncensored_C.mp4",
+        "SDDE-625_uncensored_leak_C.mp4",
+        "SDDE-625_uncensored_leak_C_cd1.mp4",
+        "Tokyo Hot n9001 FHD.mp4",  # 无-号,以前无法正确提取
+        "TokyoHot-n1287-HD SP2006 .mp4",
+        "caribean-020317_001.nfo",  # -号误命名为_号的