Merge pull request #629 from lededev/enumerate-1
storyline: add data source airavwiki
.github/workflows/main.yml (vendored): 2 changes

@@ -42,6 +42,7 @@ jobs:
           --hidden-import ADC_function.py \
           --hidden-import core.py \
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+          --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
           --add-data "Img:Img" \
           --add-data "config.ini:." \
 
@@ -53,6 +54,7 @@ jobs:
           --hidden-import ADC_function.py `
           --hidden-import core.py `
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+          --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
           --add-data "Img;Img" `
           --add-data "config.ini;." `
 
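Note on the `--add-data` entries: they bundle the cloudscraper and opencc package directories (and config.ini) into the one-file PyInstaller build. A hedged sketch of how such bundled data is located at run time (standard PyInstaller behaviour; the helper name is illustrative, not from this commit):

    import os, sys

    def resource_path(relative: str) -> str:
        # One-file builds unpack bundled data under sys._MEIPASS;
        # fall back to the source tree when running unpackaged.
        base = getattr(sys, "_MEIPASS", os.path.abspath("."))
        return os.path.join(base, relative)

    config_ini = resource_path("config.ini")  # matches --add-data "config.ini:."
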
ADC_function.py: 203 changes

@@ -14,6 +14,7 @@ from urllib.parse import urljoin
 import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+from cloudscraper import create_scraper
 
 
 def getXpathSingle(htmlcode, xpath):
@@ -22,10 +23,10 @@ def getXpathSingle(htmlcode, xpath):
     return result1
 
 
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
 
 # core web-request helper
-def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     verify = config.getInstance().cacert_file()
     configProxy = config.getInstance().proxy()
     errors = ""
@@ -41,13 +42,12 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
         else:
             result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
 
-        result.encoding = "utf-8"
 
         if return_type == "object":
             return result
         elif return_type == "content":
             return result.content
         else:
+            result.encoding = encoding or "utf-8"
             return result.text
     except requests.exceptions.ProxyError:
         print("[-]Proxy error! Please check your Proxy")
@@ -98,59 +98,150 @@ class TimeoutHTTPAdapter(HTTPAdapter):
         kwargs["timeout"] = self.timeout
         return super().send(request, **kwargs)
 
-def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+
+# with keep-alive feature
+def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    configProxy = config.getInstance().proxy()
+    session = requests.Session()
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    if configProxy.enable:
+        session.verify = config.getInstance().cacert_file()
+        session.proxies = configProxy.proxies()
+    headers = {"User-Agent": ua or G_USER_AGENT}
+    session.headers = headers
+    try:
+        if isinstance(url, str) and len(url):
+            result = session.get(str(url))
+        else:  # an empty url returns the reusable session object directly; no return_type needed
+            return session
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "session":
+            return result, session
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_session() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f"[-]get_html_session() failed. {e}")
+    return None
+
+
+def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
+        s.verify = config.getInstance().cacert_file()
         s.proxies = configProxy.proxies()
-    browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
-    result = browser.open(url)
-    if not result.ok:
-        return ''
-    result.encoding = "utf-8"
-    if return_type == "object":
-        return result
-    elif return_type == "content":
-        return result.content
-    elif return_type == "browser":
-        return result, browser
-    else:
-        return result.text
+    try:
+        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
+        if isinstance(url, str) and len(url):
+            result = browser.open(url)
+        else:
+            return browser
+        if not result.ok:
+            return None
+
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "browser":
+            return result, browser
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f'[-]get_html_by_browser() Failed! {e}')
+    return None
 
 
-def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
+        s.verify = config.getInstance().cacert_file()
         s.proxies = configProxy.proxies()
-    browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
-    result = browser.open(url)
-    if not result.ok:
-        return ''
-    form = browser.select_form() if form_select is None else browser.select_form(form_select)
-    if isinstance(fields, dict):
-        for k, v in fields.items():
-            browser[k] = v
-    response = browser.submit_selected()
-    response.encoding = "utf-8"
-    if return_type == "object":
-        return response
-    elif return_type == "content":
-        return response.content
-    elif return_type == "browser":
-        return response, browser
-    else:
-        return response.text
+    try:
+        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
+        result = browser.open(url)
+        if not result.ok:
+            return None
+        form = browser.select_form() if form_select is None else browser.select_form(form_select)
+        if isinstance(fields, dict):
+            for k, v in fields.items():
+                browser[k] = v
+        response = browser.submit_selected()
+
+        if return_type == "object":
+            return response
+        elif return_type == "content":
+            return response.content
+        elif return_type == "browser":
+            return response, browser
+        else:
+            result.encoding = encoding or "utf-8"
+            return response.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_form() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f'[-]get_html_by_form() Failed! {e}')
+    return None
+
+
+def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    configProxy = config.getInstance().proxy()
+    session = create_scraper(browser={'custom': ua or G_USER_AGENT,})
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    if configProxy.enable:
+        session.verify = config.getInstance().cacert_file()
+        session.proxies = configProxy.proxies()
+    try:
+        if isinstance(url, str) and len(url):
+            result = session.get(str(url))
+        else:  # an empty url returns the reusable scraper object directly; no return_type needed
+            return session
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "scraper":
+            return result, session
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_session() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f"[-]get_html_session() failed. {e}")
+    return None
 
 
 # def get_javlib_cookie() -> [dict, str]:
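The three fetch helpers share one return_type contract: None on failure, the decoded text by default, and a (result, transport) pair when asked. A hedged usage sketch (helper names from this diff; the URL is illustrative):

    # An empty url returns the reusable keep-alive object itself.
    session = get_html_session()
    text = get_html_session('https://example.com')                              # decoded text
    result, session = get_html_session('https://example.com', return_type='session')
    # Cloudflare-aware variant, same contract with return_type='scraper':
    result, scraper = get_html_by_scraper('https://example.com', return_type='scraper')
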
@@ -645,3 +736,37 @@ def file_not_exist_or_empty(filepath) -> bool:
 # simple Japanese-text detection
 def is_japanese(s) -> bool:
     return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
+
+
+# Usage: python ./ADC_function.py https://cn.bing.com/
+if __name__ == "__main__":
+    import sys, timeit
+    from http.client import HTTPConnection
+    def benchmark(t, url):
+        print(f"HTTP GET Benchmark times:{t} url:{url}")
+        tm = timeit.timeit(f"_ = session1.get('{url}')",
+                           "from __main__ import get_html_session;session1=get_html_session()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
+                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = browser1.open('{url}')",
+                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = get_html('{url}')",
+                           "from __main__ import get_html",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html()')
+    t = 100
+    #url = "https://www.189.cn/"
+    url = "http://www.chinaunicom.com"
+    HTTPConnection.debuglevel = 1
+    s = get_html_session()
+    _ = s.get(url)
+    HTTPConnection.debuglevel = 0
+    if len(sys.argv)>1:
+        url = sys.argv[1]
+    benchmark(t, url)

@@ -7,6 +7,7 @@ import shutil
 import typing
 import urllib3
 import signal
+from opencc import OpenCC
 
 import config
 from datetime import datetime, timedelta
@@ -377,7 +378,7 @@ def rm_empty_folder(path):
         pass
 
 
-def create_data_and_move(file_path: str, zero_op):
+def create_data_and_move(file_path: str, zero_op, oCC):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
     debug = config.getInstance().debug()
     n_number = get_number(debug, os.path.basename(file_path))
@@ -388,7 +389,7 @@ def create_data_and_move(file_path: str, zero_op):
         if zero_op:
             return
         if n_number:
-            core_main(file_path, n_number)
+            core_main(file_path, n_number, oCC)
         else:
            print("[-] number empty ERROR")
            moveFailedFolder(file_path)
@@ -399,7 +400,7 @@ def create_data_and_move(file_path: str, zero_op):
            if zero_op:
                return
            if n_number:
-                core_main(file_path, n_number)
+                core_main(file_path, n_number, oCC)
            else:
                raise ValueError("number empty")
            print("[*]======================================================")
@@ -413,13 +414,13 @@ def create_data_and_move(file_path: str, zero_op):
         print('[!]', err)
 
 
-def create_data_and_move_with_custom_number(file_path: str, custom_number):
+def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
     conf = config.getInstance()
     file_name = os.path.basename(file_path)
     try:
         print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number))
         if custom_number:
-            core_main(file_path, custom_number)
+            core_main(file_path, custom_number, oCC)
         else:
             print("[-] number empty ERROR")
         print("[*]======================================================")
@@ -488,12 +489,21 @@ def main():
 
     create_failed_folder(conf.failed_folder())
 
+    # create OpenCC converter
+    ccm = conf.cc_convert_mode()
+    try:
+        oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json')
+    except:
+        # some OSes have no OpenCC cpython build; try opencc-python-reimplemented:
+        # pip uninstall opencc && pip install opencc-python-reimplemented
+        oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t')
+
     if not single_file_path == '':  # Single File
         print('[+]==================== Single File =====================')
         if custom_number == '':
-            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
+            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
         else:
-            create_data_and_move_with_custom_number(single_file_path, custom_number)
+            create_data_and_move_with_custom_number(single_file_path, custom_number, oCC)
     else:
         folder_path = conf.source_folder()
         if not isinstance(folder_path, str) or folder_path == '':
@@ -515,7 +525,7 @@ def main():
         count = count + 1
         percentage = str(count / int(count_all) * 100)[:4] + '%'
         print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
-        create_data_and_move(movie_path, zero_op)
+        create_data_and_move(movie_path, zero_op, oCC)
         if count >= stop_count:
             print("[!]Stop counter triggered!")
             break

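The try/except above papers over two OpenCC distributions whose constructors take different arguments. A hedged sketch of the difference (behaviour as commonly documented for the two packages; verify against the installed build):

    from opencc import OpenCC

    # C-extension builds of opencc take a config file name ('t2s.json');
    # the pure-Python opencc-python-reimplemented package takes the bare
    # config name ('t2s'). Both expose the same convert() method.
    try:
        cc = OpenCC('t2s.json')
    except Exception:
        cc = OpenCC('t2s')
    print(cc.convert('漢語'))  # -> '汉语' (traditional to simplified)
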
Makefile: 2 changes

@@ -17,6 +17,8 @@ make:
 
 	@echo "[+]Pyinstaller make"
 	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+		--add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \
+		--add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \
 		--add-data "Img:Img" \
 		--add-data "config.ini:." \
 
@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool:  # detect failed metadata fetch
 
     return True
 
-def get_data_from_json(file_number):  # return metadata from JSON
+def get_data_from_json(file_number, oCC):  # return metadata from JSON
     """
     iterate through all services and fetch the data
     """
@@ -290,6 +290,20 @@ def get_data_from_json(file_number):  # return metadata from JSON
         if len(t):
             json_data[translate_value] = special_characters_replacement(t)
 
+    if oCC:
+        cc_vars = conf.cc_convert_vars().split(",")
+        for cc in cc_vars:
+            if cc == "actor":
+                json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
+                json_data['actor'] = oCC.convert(json_data['actor'])
+            elif cc == "tag":
+                json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
+            else:
+                try:
+                    json_data[cc] = oCC.convert(json_data[cc])
+                except:
+                    pass
+
     naming_rule=""
     for i in conf.naming_rule().split("+"):
         if i not in json_data:
@@ -314,4 +328,6 @@ def special_characters_replacement(text) -> str:
         replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
         replace('‘', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
         replace('’', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
-        replace('&', '&'))
+        replace('…', '…').
+        replace('&', '&')
+        )

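The oCC block above converts selected metadata fields between Traditional and Simplified Chinese. A small worked illustration (field values invented; the cc_convert_vars format is assumed to be a comma-separated field list, as the split(",") implies):

    # assuming oCC = OpenCC('t2s') and conf.cc_convert_vars() == "actor,tag,title"
    json_data = {'actor': '齋藤', 'actor_list': ['齋藤'], 'tag': ['戀愛'], 'title': '標題'}
    # After the loop: 'actor' and every entry of 'actor_list' are converted,
    # 'tag' is converted element-wise, and scalar fields like 'title' go
    # through the try/except so a missing or non-string field is skipped.
    # -> {'actor': '斋藤', 'actor_list': ['斋藤'], 'tag': ['恋爱'], 'title': '标题'}
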
@@ -6,17 +6,16 @@ import re
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
 
 
+G_SITE = 'https://www.caribbeancom.com'
+
+
 def main(number: str) -> json:
     try:
-        # the actor-photo feature is unused for now; commented out for speed and switched to get_html()
-        #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-        #                                 return_type='browser')
-        #if not r.ok:
-        #    raise ValueError("page not found")
-        #htmlcode = str(browser.page)
-        htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
-        htmlcode = htmlbyte.decode('euc-jp')
-        if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
+        url = f'{G_SITE}/moviepages/{number}/index.html'
+        result, session = get_html_session(url, return_type='session')
+        htmlcode = result.content.decode('euc-jp')
+        if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
             raise ValueError("page not found")
 
         lx = html.fromstring(htmlcode)
@@ -32,13 +31,13 @@ def main(number: str) -> json:
             'actor': get_actor(lx),
             'release': get_release(lx),
             'number': number,
-            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
             'tag': get_tag(lx),
             'extrafanart': get_extrafanart(lx),
             'label': get_series(lx),
             'imagecut': 1,
-            # 'actor_photo': get_actor_photo(browser),
-            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            # 'actor_photo': get_actor_photo(lx, session),
+            'website': f'{G_SITE}/moviepages/{number}/index.html',
             'source': 'carib.py',
             'series': get_series(lx),
         }
@@ -101,24 +100,25 @@ def get_series(lx: html.HtmlElement) -> str:
     return ''
 
 def get_runtime(lx: html.HtmlElement) -> str:
-    return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
+    return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
 
-def get_actor_photo(browser):
-    htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+def get_actor_photo(lx, session):
+    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
+    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
     t = {}
-    for a in htmla:
-        if a.text.strip() == '他':
+    for name, a in zip(names, htmla):
+        if name.strip() == '他':
             continue
-        p = {a.text.strip(): a['href']}
+        p = {name.strip(): a.attrib['href']}
         t.update(p)
     o = {}
     for k, v in t.items():
         if '/search_act/' not in v:
             continue
-        r = browser.open_relative(v)
+        r = session.get(urljoin(G_SITE, v))
         if not r.ok:
             continue
-        html = browser.page.prettify()
+        html = r.text
         pos = html.find('.full-bg')
         if pos<0:
             continue
@@ -126,7 +126,7 @@ def get_actor_photo(browser):
         cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
         if not cssBGjpgs or not len(cssBGjpgs[0]):
             continue
-        p = {k: urljoin(browser.url, cssBGjpgs[0])}
+        p = {k: urljoin(r.url, cssBGjpgs[0])}
         o.update(p)
     return o
 
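The carib pages are EUC-JP, so the rewrite decodes result.content explicitly instead of trusting requests' guessed encoding, and keeps the session for follow-up actor-page requests. A minimal sketch (URL pattern from this diff; the search_act path is illustrative):

    result, session = get_html_session(f'{G_SITE}/moviepages/{number}/index.html',
                                       return_type='session')
    htmlcode = result.content.decode('euc-jp')   # raw bytes, explicit codec
    # follow-up requests reuse the same keep-alive session:
    r = session.get(urljoin(G_SITE, '/search_act/12345/'))
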
@@ -5,6 +5,7 @@ from lxml import etree  # need install
 import json
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
+import inspect
 
 def getActorPhoto(html):
     actors = html.xpath('//div[@class="star-name"]/a')
@@ -60,6 +61,8 @@ def getCID(html):
     result = re.sub('/.*?.jpg','',string)
     return result
 def getOutline(number, title):  # fetch storyline; concurrent multi-process query
+    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
+        return ''  # calls coming from airav.py return immediately without computing the outline, to avoid duplicate fetches slowing processing
     return getStoryline(number,title)
 def getSeriseJa(html):
     x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
@@ -115,8 +118,15 @@ def main_uncensored(number):
 def main(number):
     try:
         try:
+            url = "https://www." + secrets.choice([
+                'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
+                'cdnbus.fun',
+                'dmmbus.fun', 'dmmsee.fun',
+                'fanbus.us',
+                'seedmm.fun',
+            ]) + "/"
             try:
-                htmlcode = get_html('https://www.fanbus.us/' + number)
+                htmlcode = get_html(url + number)
             except:
                 htmlcode = get_html('https://www.javbus.com/' + number)
             if "<title>404 Page Not Found" in htmlcode:
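The inspect.stack() guard above keys off the caller's file name. A standalone sketch of the pattern (module names illustrative):

    import inspect, os

    def called_from(filename: str) -> bool:
        # walk the call stack and match on the basename of each frame's source file
        return any(os.path.basename(f.filename) == filename for f in inspect.stack())

    def getOutline(number, title):
        if called_from('airav.py'):   # skip duplicate work for this caller
            return ''
        return getStoryline(number, title)
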
@@ -4,7 +4,6 @@ import re
 from lxml import etree
 import json
 from ADC_function import *
-from mechanicalsoup.stateful_browser import StatefulBrowser
 from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -30,8 +29,8 @@ def getActor(html):
         idx = idx + 1
     return r
 
-def getaphoto(url, browser):
-    html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
+def getaphoto(url, session):
+    html_page = session.get(url).text if session is not None else get_html(url)
     img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
     img_url = img_prether.findall(html_page)
     if img_url:
@@ -39,7 +38,7 @@ def getaphoto(url, browser):
     else:
         return ''
 
-def getActorPhoto(html, javdb_site, browser):  #//*[@id="star_qdt"]/li/a/img
+def getActorPhoto(html, javdb_site, session):
     actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
     if not actorall:
         return {}
@@ -47,7 +46,7 @@ def getActorPhoto(html, javdb_site, browser):
     actor_photo = {}
     for i in actorall:
         if i.text in a:
-            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
+            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
     return actor_photo
 
 def getStudio(a, html):
@@ -178,15 +177,6 @@ def getDirector(html):
     result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
-def getOutline0(number):  # fetch storyline; airav.wiki is returning 404, renamed for now, delete once it clearly will not recover
-    try:
-        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-        from WebCrawler.airav import getOutline as airav_getOutline
-        result = airav_getOutline(htmlcode)
-        return result
-    except:
-        pass
-    return ''
 def getOutline(number, title):  # fetch storyline; concurrent multi-process query
     return getStoryline(number,title)
 def getSeries(html):
@@ -224,15 +214,22 @@ def main(number):
     javdb_site = secrets.choice(javdb_sites)
     if debug:
         print(f'[!]javdb:select site {javdb_site}')
-    browser = None
+    session = None
+    javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
     try:
-        javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
-        res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
-        if not res.ok:
+        if debug:
+            raise  # try get_html_by_scraper() branch
+        res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
+        if not res:
             raise
         query_result = res.text
     except:
-        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies)
+        res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
+        if not res:
+            raise ValueError('page not found')
+        query_result = res.text
+    if session is None:
+        raise ValueError('page not found')
     html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     # javdb sometimes returns multiple results,
     # and the first element may not be the one we are looking for
@@ -251,13 +248,12 @@ def main(number):
         raise ValueError("number not found")
     correct_url = urls[0]
     try:
-        if isinstance(browser, StatefulBrowser):  # get faster benefit from http keep-alive
-            detail_page = browser.open_relative(correct_url).text
-        else:
-            javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
-            detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
+        # get faster benefit from http keep-alive
+        javdb_detail_url = urljoin(res.url, correct_url)
+        detail_page = session.get(javdb_detail_url).text
     except:
         detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
+        session = None
 
     # etree.fromstring is expensive and best called only once; its xpath is fast, faster than bs4 find/select, so use it freely
     lx = etree.fromstring(detail_page, etree.HTMLParser())
@@ -303,8 +299,8 @@ def main(number):
         'tag': getTag(lx),
         'label': getLabel(lx),
         'year': getYear(detail_page),  # str(re.search('\d{4}',getRelease(a)).group()),
-        # 'actor_photo': getActorPhoto(lx, javdb_site, browser),
-        'website': 'https://javdb.com' + correct_url,
+        # 'actor_photo': getActorPhoto(lx, javdb_site, session),
+        'website': urljoin('https://javdb.com', correct_url),
         'source': 'javdb.py',
         'series': getSeries(lx),
 
@@ -318,7 +314,7 @@ def main(number):
 
 
     except Exception as e:
-        if config.getInstance().debug():
+        if debug:
             print(e)
         dic = {"title": ""}
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -333,12 +329,12 @@ if __name__ == "__main__":
     # print(main('BANK-022'))
     # print(main('070116-197'))
     # print(main('093021_539'))  # no stills; studio pacopacomama
-    # print(main('FC2-2278260'))
+    print(main('FC2-2278260'))
     # print(main('FC2-735670'))
     # print(main('FC2-1174949'))  # not found
     print(main('MVSD-439'))
     # print(main('EHM0001'))  # not found
-    # print(main('FC2-2314275'))
+    print(main('FC2-2314275'))
     # print(main('EBOD-646'))
     # print(main('LOVE-262'))
     print(main('ABP-890'))

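javdb now tries the plain keep-alive session first and falls back to the cloudscraper-backed session on failure. A condensed sketch of that fallback (helper names from this diff):

    session = None
    try:
        res, session = get_html_session(url, cookies=cookies, return_type='session')
        if not res:
            raise ValueError('session fetch failed')
    except Exception:
        # second chance: cloudscraper handles Cloudflare challenges
        res, session = get_html_by_scraper(url, cookies=cookies, return_type='scraper')
    query_result = res.text
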
@@ -4,13 +4,14 @@ import re
 import json
 import builtins
 from ADC_function import *
+from lxml.html import fromstring
 from multiprocessing import Pool
 from multiprocessing.dummy import Pool as ThreadPool
 from difflib import SequenceMatcher
 from unicodedata import category
 from number_parser import is_uncensored
 
-G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
+G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
 
 G_mode_txt = ('顺序执行','线程池','进程池')
 
@@ -27,6 +28,8 @@ class noThread(object):
 def getStoryline(number, title, sites: list=None):
     start_time = time.time()
     conf = config.getInstance()
+    if not conf.is_storyline():
+        return ''
     debug = conf.debug() or conf.storyline_show() == 2
     storyine_sites = conf.storyline_site().split(',') if sites is None else sites
     if is_uncensored(number):
@@ -49,82 +52,87 @@ def getStoryline(number, title, sites: list=None):
     run_mode = conf.storyline_mode()
     assert run_mode in (0,1,2)
     with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
-        result = pool.map(getStoryline_mp, mp_args)
+        results = pool.map(getStoryline_mp, mp_args)
     if not debug and conf.storyline_show() == 0:
-        for value in result:
+        for value in results:
             if isinstance(value, str) and len(value):
                 return value
         return ''
     # the debug output below is written to the log; output from inside a process pool is not and only shows on stdout
-    cnt = len(apply_sites)
-    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
+    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
     first = True
     sel = ''
-    for i in range(cnt):
-        sl = len(result[i]) if isinstance(result[i], str) else 0
+    for site, desc in zip(apply_sites, results):
+        sl = len(desc) if isinstance(desc, str) else 0
         if sl and first:
-            s += f',[选中{apply_sites[i]}字数:{sl}]'
+            s += f',[选中{site}字数:{sl}]'
             first = False
-            sel = result[i]
+            sel = desc
         elif sl:
-            s += f',{apply_sites[i]}字数:{sl}'
+            s += f',{site}字数:{sl}'
         else:
-            s += f',{apply_sites[i]}:空'
+            s += f',{site}:空'
     print(s)
     return sel
 
 
 def getStoryline_mp(args):
-    return _getStoryline_mp(*args)
-
-# note: print() from a new process is not written to the log; when debugging broken data sources, watch stdout directly and screenshot it for issue reports
-def _getStoryline_mp(site, number, title, debug):
-    start_time = time.time()
-    storyline = None
-    if not isinstance(site, str):
-        return storyline
-    elif site == "airav":
-        storyline = getStoryline_airav(number, debug)
-    elif site == "avno1":
-        storyline = getStoryline_avno1(number, debug)
-    elif site == "xcity":
-        storyline = getStoryline_xcity(number, debug)
-    elif site == "amazon":
-        storyline = getStoryline_amazon(title, number, debug)
-    elif site == "58avgo":
-        storyline = getStoryline_58avgo(number, debug)
-    if not debug:
-        return storyline
-    print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
-        site,
-        time.time() - start_time,
-        time.strftime("%H:%M:%S"),
-        storyline if isinstance(storyline, str) and len(storyline) else '[空]')
-    )
-    return storyline
+    def _inner(site, number, title, debug):
+        start_time = time.time()
+        storyline = None
+        if not isinstance(site, str):
+            return storyline
+        elif site == "airavwiki":
+            storyline = getStoryline_airavwiki(number, debug)
+        elif site == "airav":
+            storyline = getStoryline_airav(number, debug)
+        elif site == "avno1":
+            storyline = getStoryline_avno1(number, debug)
+        elif site == "xcity":
+            storyline = getStoryline_xcity(number, debug)
+        elif site == "amazon":
+            storyline = getStoryline_amazon(title, number, debug)
+        elif site == "58avgo":
+            storyline = getStoryline_58avgo(number, debug)
+        if not debug:
+            return storyline
+        # in process-pool mode, print() from the child getStoryline_*() is not written to the log; thread pool and sequential execution are unaffected
+        print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
+            site,
+            time.time() - start_time,
+            time.strftime("%H:%M:%S"),
+            storyline if isinstance(storyline, str) and len(storyline) else '[空]')
+        )
+        return storyline
+    return _inner(*args)
 
 
 def getStoryline_airav(number, debug):
     try:
-        number_up = number
         site = secrets.choice(('airav.cc','airav4.club'))
         url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
-        res, browser = get_html_by_browser(url, return_type='browser')
-        if not res.ok:
-            raise ValueError(f"get_html_by_browser('{url}') failed")
-        avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
-        if number_up not in avs.select_one('a > h3').text.upper():
+        res, session = get_html_session(url, return_type='session')
+        if not res:
+            raise ValueError(f"get_html_by_session('{url}') failed")
+        lx = fromstring(res.text)
+        urls = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/@href')
+        txts = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/h3[@class="one_name ga_name"]/text()')
+        detail_url = None
+        for txt, url in zip(txts, urls):
+            if re.search(number, txt, re.I):
+                detail_url = urljoin(res.url, url)
+                break
+        if detail_url is None:
             raise ValueError("number not found")
-        detail_url = avs.select_one('a')['href']
-        res = browser.open_relative(detail_url)
+        res = session.get(detail_url)
         if not res.ok:
-            raise ValueError(f"browser.open_relative('{detail_url}') failed")
-        t = browser.page.select_one('head > title').text
-        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
-        if number.upper() != airav_number:
+            raise ValueError(f"session.get('{detail_url}') failed")
+        lx = fromstring(res.text)
+        t = str(lx.xpath('/html/head/title/text()')[0]).strip()
+        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0])
+        if not re.search(number, airav_number, re.I):
             raise ValueError(f"page number ->[{airav_number}] not match")
-        desc = browser.page.select_one('li.introduction > span').text.strip()
+        desc = str(lx.xpath('//span[@id="ContentPlaceHolder1_Label2"]/text()')[0]).strip()
         return desc
     except Exception as e:
         if debug:
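The `with ThreadPool(...) if ... else Pool(...) if ... else noThread()` line selects one of three execution modes behind a single context manager. A reduced sketch of the idiom, covering only the thread and process branches (worker body illustrative):

    from multiprocessing import Pool
    from multiprocessing.dummy import Pool as ThreadPool

    def fetch(site):            # illustrative worker
        return f"{site}:done"

    run_mode, cores = 1, 4      # 1 = thread pool, 2 = process pool
    sites = ["airavwiki", "amazon"]
    with ThreadPool(cores) if run_mode == 1 else Pool(cores) as pool:
        results = pool.map(fetch, sites)
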
@@ -133,6 +141,43 @@ def getStoryline_airav(number, debug):
     return None
 
 
+def getStoryline_airavwiki(number, debug):
+    try:
+        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+        url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}'
+        result, session = get_html_session(url, return_type='session')
+        if not result:
+            raise ValueError(f"get_html_session('{url}','{number}') failed")
+        j = json.loads(result.content)
+        if int(j.get('count')) == 0:
+            raise ValueError("number not found")
+        link = None
+        for r in j["result"]:
+            n = r['barcode']
+            if re.search(number, n, re.I):
+                link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW')
+                break
+        if link is None:
+            raise ValueError("number not found")
+        result = session.get(link)
+        if not result.ok or not re.search(number, result.url, re.I):
+            raise ValueError("detail page not found")
+        j = json.loads(result.content)
+        if int(j.get('count')) != 1:
+            raise ValueError("number not found")
+        detail_number = j["result"]['barcode']
+        if not re.search(number, detail_number, re.I):
+            raise ValueError("detail page number not match, got ->[{detail_number}]")
+        desc = j["result"]['description']
+        return desc
+
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
 def getStoryline_58avgo(number, debug):
     try:
         url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
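The new airavwiki source talks to a JSON API rather than scraping HTML. A hedged sketch of the two-step lookup (endpoints taken from this diff; the response fields count/result/barcode/description are as the new code assumes them, and the number is illustrative):

    import json, re, requests
    from urllib.parse import urljoin

    number = 'ABP-890'
    url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={number}'
    r = requests.get(url, timeout=10)
    j = r.json()   # expected shape: {'count': N, 'result': [{'barcode': ...}, ...]}
    hit = next((v for v in j['result'] if re.search(number, v['barcode'], re.I)), None)
    if hit:
        detail = requests.get(urljoin(r.url, f"/api/video/barcode/{hit['barcode']}?lng=zh-TW"),
                              timeout=10).json()
        print(detail['result']['description'])
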
@@ -143,27 +188,27 @@ def getStoryline_58avgo(number, debug):
         result, browser = get_html_by_form(url,
                                            fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
                                            return_type = 'browser')
-        if not result.ok:
+        if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
         if f'searchresults.aspx?Search={kwd}' not in browser.url:
             raise ValueError("number not found")
         s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
         link = None
-        for i in range(len(s)):
-            title = s[i].h3.text.strip()
+        for a in s:
+            title = a.h3.text.strip()
             if re.search(number, title, re.I):
-                link = s[i]
+                link = a
                 break
         if link is None:
             raise ValueError("number not found")
         result = browser.follow_link(link)
         if not result.ok or 'playon.aspx' not in browser.url:
             raise ValueError("detail page not found")
-        title = browser.page.select('head > title')[0].text.strip()
+        title = browser.page.select_one('head > title').text.strip()
         detail_number = str(re.findall('\[(.*?)]', title)[0])
         if not re.search(number, detail_number, re.I):
             raise ValueError("detail page number not match, got ->[{detail_number}]")
-        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+        return browser.page.select_one('#ContentPlaceHolder1_Label2').text.strip()
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
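get_html_by_form drives a MechanicalSoup StatefulBrowser: it opens the page, fills the selected form, submits it, and with return_type='browser' hands back both the response and the browser for follow-up navigation. A minimal sketch (form field name from this diff; the search term is illustrative):

    result, browser = get_html_by_form(
        'http://58avgo.com/cn/index.aspx',
        fields={'ctl00$TextBox_SearchKeyWord': 'ABP-890'},
        return_type='browser')
    if result:                       # the helper now returns None on failure
        print(browser.url)           # post-submit URL, e.g. the search results page
        link = browser.page.select_one('a.ga_click')
        result = browser.follow_link(link)
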
@@ -172,6 +217,29 @@ def getStoryline_58avgo(number, debug):
 
 
 def getStoryline_avno1(number, debug):  # fetch storyline from avno1.cc
+    try:
+        site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
+            'hotav.biz','iqq2.xyz','javhq.tv',
+            'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
+        url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
+        lx = fromstring(get_html_by_scraper(url))
+        descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
+        titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
+        if not descs or not len(descs):
+            raise ValueError(f"number not found")
+        for title, desc in zip(titles, descs):
+            page_number = title[title.rfind(' '):].strip()
+            if re.search(number, page_number, re.I):
+                return desc.strip()
+        raise ValueError(f"page number ->[{page_number}] not match")
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
+def getStoryline_avno1OLD(number, debug):  # fetch storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
@@ -181,14 +249,14 @@ def getStoryline_avno1(number, debug):  # fetch storyline from avno1.cc
             form_select='div.wrapper > div.header > div.search > form',
             fields = {'kw' : number},
             return_type = 'browser')
-        if not result.ok:
+        if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
         s = browser.page.select('div.type_movie > div > ul > li > div')
-        for i in range(len(s)):
-            title = s[i].a.h3.text.strip()
+        for div in s:
+            title = div.a.h3.text.strip()
             page_number = title[title.rfind(' '):].strip()
             if re.search(number, page_number, re.I):
-                return s[i]['data-description'].strip()
+                return div['data-description'].strip()
         raise ValueError(f"page number ->[{page_number}] not match")
     except Exception as e:
         if debug:
@@ -221,41 +289,45 @@ def getStoryline_amazon(q_title, number, debug):
|
|||||||
     if not isinstance(q_title, str) or not len(q_title):
         return None
     try:
-        amazon_cookie, _ = load_cookies('amazon.json')
-        cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
+        cookie, cookies_filepath = load_cookies('amazon.json')
         url = "https://www.amazon.co.jp/s?k=" + q_title
-        res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
-        if not res.ok:
-            raise ValueError("get_html_by_browser() failed")
-        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
-        if isinstance(lks, list) and len(lks):
-            browser.follow_link(lks[0])
+        res, session = get_html_session(url, cookies=cookie, return_type='session')
+        if not res:
+            raise ValueError("get_html_session() failed")
+        lx = fromstring(res.text)
+        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
+        if len(lks) and lks[0].startswith('/'):
+            res = session.get(urljoin(res.url, lks[0]))
             cookie = None
-        html = etree.fromstring(str(browser.page), etree.HTMLParser())
-        titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
-        urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
+        lx = fromstring(res.text)
+        titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
+        urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
         if not len(urls) or len(urls) != len(titles):
             raise ValueError("titles not found")
         idx = amazon_select_one(titles, q_title, number, debug)
         if not isinstance(idx, int) or idx < 0:
             raise ValueError("title and number not found")
-        furl = urls[idx]
-        r = browser.open_relative(furl)
-        if not r.ok:
+        furl = urljoin(res.url, urls[idx])
+        res = session.get(furl)
+        if not res.ok:
             raise ValueError("browser.open_relative()) failed.")
-        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
-        if isinstance(lks, list) and len(lks):
-            browser.follow_link(lks[0])
+        lx = fromstring(res.text)
+        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
+        if len(lks) and lks[0].startswith('/'):
+            res = session.get(urljoin(res.url, lks[0]))
             cookie = None
-        ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
-        ama_t = re.sub(r'審査番号:\d+', '', ama_t)
+        lx = fromstring(res.text)
+        div = lx.xpath('//*[@id="productDescription"]')[0]
+        ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
+        ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()

         if cookie is None:
-            # the auto-created cookies file sits at the end of the search path list, lowest priority; users with an amazon.co.jp account can export browser cookies into an earlier search path
+            # drop invalid cookies, whether user-created or auto-created, to avoid failing on every run
+            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
+            # the auto-created cookies file sits at the end of the search path list, lowest priority; users with an amazon.co.jp account can export browser cookies into an earlier search path
             ama_save = Path.home() / ".local/share/avdc/amazon.json"
             ama_save.parent.mkdir(parents=True, exist_ok=True)
-            ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+            ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')

         return ama_t
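The commit swaps MechanicalSoup's stateful browser for a plain requests session plus lxml parsing, while keeping Amazon's age-gate ("black curtain") handling: if the interstitial link appears, it is followed with the same session so the confirmation cookie persists. A hedged sketch of that flow; the URL is illustrative and no error handling is shown:

    import requests
    from urllib.parse import urljoin
    from lxml.html import fromstring

    session = requests.Session()
    res = session.get('https://www.amazon.co.jp/s?k=example')
    lx = fromstring(res.text)
    # Age-gate interstitial: follow its relative link with the same session
    lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
    if len(lks) and lks[0].startswith('/'):
        res = session.get(urljoin(res.url, lks[0]))  # session keeps the cookie jar

Persisting session.cookies afterwards, as the diff does, means the gate is only answered once across runs.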
@@ -270,32 +342,31 @@ def amazon_select_one(a_titles, q_title, number, debug):
     sel = -1
     ratio = 0
     que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
-    for loc in range(len(a_titles)):
-        t = a_titles[loc]
-        if re.search(number, t, re.I): # titles rarely carry the product number, but a few do; a direct number match passes immediately
-            return loc
-        if not re.search('DVD|Blu-ray', t, re.I):
+    for tloc, title in enumerate(a_titles):
+        if re.search(number, title, re.I): # titles rarely carry the product number, but a few do; a direct number match passes immediately
+            return tloc
+        if not re.search('DVD|Blu-ray', title, re.I):
             continue
-        ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I))
+        ama_t = str(re.sub('DVD|Blu-ray', "", title, re.I))
         ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
         findlen = 0
         lastpos = -1
-        cnt = len(ama_t)
-        for c in reversed(ama_t):
-            cnt -= 1
-            pos = que_t.rfind(c)
+        for cloc, char in reversed(tuple(enumerate(ama_t))):
+            pos = que_t.rfind(char)
             if lastpos >= 0:
-                pos_near = que_t[:lastpos].rfind(c)
+                pos_near = que_t[:lastpos].rfind(char)
                 if pos_near < 0:
                     findlen = 0
                     lastpos = -1
-                    ama_t = ama_t[:cnt+1]
+                    ama_t = ama_t[:cloc+1]
                 else:
                     pos = pos_near
             if pos < 0:
-                if category(c) == 'Nd':
+                if category(char) == 'Nd':
                     return -1
-                ama_t = ama_t[:cnt]
+                if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
+                    return -1
+                ama_t = ama_t[:cloc]
                 findlen = 0
                 lastpos = -1
                 continue
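The rewritten inner loop walks the candidate title from its tail while still knowing each character's index; reversed(tuple(enumerate(...))) is the idiom that replaces the old hand-maintained cnt counter. A tiny standalone demonstration:

    ama_t = 'abc'
    for cloc, char in reversed(tuple(enumerate(ama_t))):
        print(cloc, char)   # prints: 2 c, then 1 b, then 0 a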
@@ -311,7 +382,7 @@ def amazon_select_one(a_titles, q_title, number, debug):
                 return -1
         r = SequenceMatcher(None, ama_t, que_t).ratio()
         if r > ratio:
-            sel = loc
+            sel = tloc
             ratio = r
             save_t_ = ama_t
     if ratio > 0.999:
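After the character scan, amazon_select_one keeps the candidate with the highest difflib similarity and (per the hunk header) accepts early once the ratio exceeds 0.999. A minimal sketch of that scoring step, with illustrative titles:

    from difflib import SequenceMatcher

    candidates = ['demo title A', 'demo title B']   # illustrative
    query = 'demo title B'
    best_idx, best_ratio = -1, 0.0
    for i, cand in enumerate(candidates):
        r = SequenceMatcher(None, cand, query).ratio()
        if r > best_ratio:
            best_idx, best_ratio = i, r
    print(best_idx, round(best_ratio, 3))           # 1 1.0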
19 config.ini
@@ -1,4 +1,4 @@
 # For the detailed tutorial, see
 # - https://github.com/yoshiko2/AV_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini
 [common]
 main_mode=1
@@ -83,24 +83,29 @@ water=2
 
 ; Stills (extrafanart)
 [extrafanart]
-switch=0
+switch=1
 parallel_download=5
 extrafanart_folder=extrafanart
 
 ; Storyline (plot synopsis)
 [storyline]
+switch=1
 ; When website is javbus, javdb, avsox, xcity or carib, the site / censored_site / uncensored_site lists below are the
 ; optional data sources for the storyline. Sites in a list are queried concurrently; priority is the number before the
 ; colon, ascending, so a higher-numbered site's result is used only when every lower-numbered site returned nothing.
-; airav, avno1 and 58avgo return Chinese synopses: airav covers censored titles only, avno1 covers both, and 58avgo
-; covers only uncensored or leaked/decensored films (that capability is unused).
+; airavwiki, airav, avno1 and 58avgo return Chinese synopses: airav covers censored titles only, avno1 and airavwiki
+; cover both censored and uncensored, and 58avgo covers only uncensored or leaked/decensored films (unused).
 ; xcity and amazon are Japanese; since the Amazon store has no product-number metadata, the matching DVD is selected
 ; with only 99.6% accuracy. If all three lists are empty nothing is queried, which speeds up scraping considerably.
 ; site=
-site=3:avno1
+site=1:avno1,4:airavwiki
-censored_site=1:airav,4:xcity,5:amazon
+censored_site=2:airav,5:xcity,6:amazon
-uncensored_site=2:58avgo
+uncensored_site=3:58avgo
 ; Run mode: 0 = sequential (slowest), 1 = thread pool (default), 2 = process pool (more startup overhead than
 ; threads; faster the more sites are queried concurrently)
 run_mode=1
 ; show_result: storyline debug output. 0 = off, 1 = brief, 2 = verbose (the verbose part is not logged); turn on 2
 ; to find out why a storyline source stopped working
 show_result=0
 
+; Traditional/Simplified Chinese conversion. mode=0: no conversion, 1: Traditional to Simplified, 2: Simplified to Traditional
+[cc_convert]
+mode=1
+vars=actor,director,label,outline,series,studio,tag,title
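The "N:site" entries are ordered by the number before the colon. A sketch of turning such a value into a priority-ordered list (not the project's actual parser, just the semantics the comments describe):

    def parse_sites(value: str):
        # 'site=1:avno1,4:airavwiki' -> ['avno1', 'airavwiki']
        pairs = []
        for item in value.split(','):
            prio, _, name = item.strip().partition(':')
            pairs.append((int(prio), name))
        return [name for prio, name in sorted(pairs)]

    print(parse_sites('1:avno1,4:airavwiki'))       # ['avno1', 'airavwiki']
    print(parse_sites('2:airav,5:xcity,6:amazon'))  # ['airav', 'xcity', 'amazon']

Note the numbers need not be consecutive; renumbering across the three lists, as this commit does, keeps a single global priority order.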
38 config.py
@@ -246,23 +246,29 @@ class Config:
     def debug(self) -> bool:
         return self.getboolean_override("debug_mode", "switch")
 
+    def is_storyline(self) -> bool:
+        try:
+            return self.conf.getboolean("storyline", "switch")
+        except:
+            return True
+
     def storyline_site(self) -> str:
         try:
             return self.conf.get("storyline", "site")
         except:
-            return "avno1"
+            return "1:avno1,4:airavwiki"
 
     def storyline_censored_site(self) -> str:
         try:
             return self.conf.get("storyline", "censored_site")
         except:
-            return "airav,xcity,amazon"
+            return "2:airav,5:xcity,6:amazon"
 
     def storyline_uncensored_site(self) -> str:
         try:
             return self.conf.get("storyline", "uncensored_site")
         except:
-            return "3:58avgo"
 
     def storyline_show(self) -> int:
         try:
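is_storyline() defaults to True whenever the key or section is missing, so existing config files keep fetching storylines. The same behaviour is available without a bare except via configparser's fallback argument (a sketch, not a change this commit makes):

    from configparser import ConfigParser

    conf = ConfigParser()
    conf.read_string('[storyline]\nswitch=1\n')
    print(conf.getboolean('storyline', 'switch', fallback=True))       # True
    print(conf.getboolean('storyline', 'missing_key', fallback=True))  # True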
@@ -278,6 +284,19 @@ class Config:
         except:
             return 1
 
+    def cc_convert_mode(self) -> int:
+        try:
+            v = self.conf.getint("cc_convert", "mode")
+            return v if v in (0,1,2) else 2 if v > 2 else 0
+        except:
+            return 1
+
+    def cc_convert_vars(self) -> str:
+        try:
+            return self.conf.get("cc_convert", "vars")
+        except:
+            return "actor,director,label,outline,series,studio,tag,title"
+
     @staticmethod
     def _exit(sec: str) -> None:
         print("[-] Read config error! Please check the {} section in config.ini", sec)
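cc_convert_mode maps 1 to Traditional-to-Simplified and 2 to Simplified-to-Traditional conversion of the fields listed in cc_convert_vars. A minimal OpenCC sketch, assuming the opencc package pinned in requirements.txt (the config name may be 't2s' or 't2s.json' depending on the opencc build):

    from opencc import OpenCC

    mode = 1                                  # value from cc_convert_mode()
    cc = OpenCC('t2s' if mode == 1 else 's2t')
    print(cc.convert('漢字'))                  # -> 汉字 when mode == 1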
@@ -374,11 +393,18 @@ class Config:
 
         sec14 = "storyline"
         conf.add_section(sec14)
-        conf.set(sec14, "site", "avno1")
-        conf.set(sec14, "censored_site", "airav,xcity,amazon")
-        conf.set(sec14, "uncensored_site", "58avgo")
+        conf.set(sec14, "switch", 1)
+        conf.set(sec14, "site", "1:avno1,4:airavwiki")
+        conf.set(sec14, "censored_site", "2:airav,5:xcity,6:amazon")
+        conf.set(sec14, "uncensored_site", "3:58avgo")
         conf.set(sec14, "show_result", 0)
         conf.set(sec14, "run_mode", 1)
+        conf.set(sec14, "cc_convert", 1)
+
+        sec15 = "cc_convert"
+        conf.add_section(sec15)
+        conf.set(sec15, "mode", 1)
+        conf.set(sec15, "vars", "actor,director,label,outline,series,studio,tag,title")
 
         return conf
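One caveat worth flagging: stock configparser only accepts string option values, so if these defaults are ever written through ConfigParser.set(), the integer arguments need casting (a defensive sketch, not something this commit changes; the project may wrap set() with its own coercion):

    from configparser import ConfigParser

    conf = ConfigParser()
    conf.add_section('cc_convert')
    conf.set('cc_convert', 'mode', str(1))  # plain set() raises TypeError on non-str values
    conf.set('cc_convert', 'vars', 'actor,director,label,outline,series,studio,tag,title')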
32 core.py
@@ -217,14 +217,12 @@ def extrafanart_download_one_by_one(data, path, filepath):
     print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
 
 def download_one_file(args):
-    return _download_one_file(*args)
-
-def _download_one_file(url: str, save_path: Path):
-    filebytes = get_html(url, return_type='content')
-    if isinstance(filebytes, bytes) and len(filebytes):
-        if len(filebytes) == save_path.open('wb').write(filebytes):
-            return str(save_path)
-    return None
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
 
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
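pool.map feeds each worker exactly one argument, so multi-argument jobs are packed into tuples and unpacked inside the worker; folding the helper into a closure keeps the unpacking next to the download logic. A minimal demonstration of the pattern (names are illustrative):

    from concurrent.futures import ThreadPoolExecutor

    def work(args):
        def _inner(url: str, name: str):
            return f'{name} <- {url}'        # stand-in for the real download
        return _inner(*args)                  # unpack the packed arguments

    jobs = [('http://example.com/1.jpg', 'a.jpg'), ('http://example.com/2.jpg', 'b.jpg')]
    with ThreadPoolExecutor(2) as pool:
        print(list(pool.map(work, jobs)))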
@@ -232,11 +230,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
     mp_args = []
-    for i in range(len(url_list)):
-        jpg_fullpath = extrafanart_dir / f'extrafanart-{i+1}.jpg'
+    for i, url in enumerate(url_list, start=1):
+        jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url_list[i], jpg_fullpath))
+        mp_args.append((url, jpg_fullpath))
     if not len(mp_args):
         return
     extrafanart_dir.mkdir(parents=True, exist_ok=True)
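enumerate(..., start=1) yields 1-based indices directly, removing the i+1 arithmetic from the filename (and, incidentally, the off-by-one bug the old code had in mp_args.append, which indexed url_list with the 0-based i after building a 1-based filename). In brief:

    urls = ['u1', 'u2', 'u3']                # illustrative
    for i, url in enumerate(urls, start=1):
        print(f'extrafanart-{i}.jpg', url)   # extrafanart-1.jpg u1, ...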
@@ -246,11 +244,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     with ThreadPoolExecutor(parallel) as pool:
         result = list(pool.map(download_one_file, mp_args))
     failed = 0
-    for i in range(len(result)):
-        if not result[i]:
-            print(f'[-]Extrafanart {i+1} for [{number}] download failed!')
+    for i, r in enumerate(result, start=1):
+        if not r:
             failed += 1
-    if not all(result): # non-fatal: the movie is not moved to the failed folder; run mode 3 can fill the gaps later
+            print(f'[-]Extrafanart {i} for [{number}] download failed!')
+    if failed: # non-fatal: the movie is not moved to the failed folder; run mode 3 can fill the gaps later
         print(f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.")
     else:
         print(f"[+]Successfully downloaded {len(result)} extrafanart to '{extrafanart_dir}'")
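Testing `if failed:` instead of re-scanning with all(result) avoids a second pass over the results. The count could equally be derived in one line after the loop (a sketch, not part of the commit):

    result = ['ok', None, 'ok']              # illustrative pool.map output
    failed = sum(1 for r in result if not r)
    print(failed)                            # 1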
@@ -574,7 +572,7 @@ def debug_print(data: json):
     pass
 
 
-def core_main(file_path, number_th):
+def core_main(file_path, number_th, oCC):
     conf = config.getInstance()
     # ======================================================================= initialize the variables we need
     multi_part = 0
@@ -589,7 +587,7 @@ def core_main(file_path, number_th):
     # the commented-out variables below are not needed
     #rootpath= os.getcwd
     number = number_th
-    json_data = get_data_from_json(number) # define the product number
+    json_data = get_data_from_json(number, oCC) # define the product number
 
     # Return if blank dict returned (data not found)
     if not json_data:
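core_main now takes an oCC converter built once by the caller and threaded through to get_data_from_json, so every scraped field is converted with a single OpenCC instance rather than one per file. A hedged sketch of the call site; the surrounding names are assumed from this diff, not shown in it:

    from opencc import OpenCC

    conf_mode = 1                              # from cc_convert_mode()
    # config name may be 't2s'/'s2t' or with a '.json' suffix, depending on the opencc build
    oCC = OpenCC('t2s' if conf_mode == 1 else 's2t') if conf_mode else None
    # core_main(file_path, number_th, oCC)     # one converter shared across all files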
@@ -2,6 +2,7 @@
 # Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force
 
 $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
+$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1)
 
 mkdir build
 mkdir __pycache__
@@ -10,6 +11,7 @@ pyinstaller --onefile AV_Data_Capture.py `
     --hidden-import ADC_function.py `
     --hidden-import core.py `
     --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
+    --add-data "$OPENCC_PATH;opencc" `
     --add-data "Img;Img" `
     --add-data "config.ini;." `
 
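--add-data "SRC;DEST" copies a directory into the one-file bundle (";" separator on Windows, ":" elsewhere, as the other build scripts below show); at run time PyInstaller unpacks it under sys._MEIPASS. A common lookup helper, sketched here as an assumption about how bundled files like config.ini can be resolved, not necessarily how this project does it:

    import sys
    from pathlib import Path

    def resource_path(rel: str) -> Path:
        # _MEIPASS exists only inside a PyInstaller one-file bundle
        base = getattr(sys, '_MEIPASS', Path(__file__).resolve().parent)
        return Path(base) / rel

    print(resource_path('config.ini'))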
@@ -8,3 +8,4 @@ pysocks==1.7.1
 urllib3==1.24.3
 certifi==2020.12.5
 MechanicalSoup==1.1.0
+opencc==1.1.1
@@ -2,6 +2,7 @@ pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscra
 pip install pyquery pyinstaller
 pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
     --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "$(python3.8 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \
 
@@ -14,6 +14,7 @@ pip3 install -r requirements.txt
 pip3 install cloudscraper==1.2.52
 pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
     --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \
 