Merge pull request #629 from lededev/enumerate-1

storyline: add data source airavwiki

Yoshiko2 committed 2021-11-04 22:18:35 +08:00 (committed by GitHub)
16 changed files with 494 additions and 228 deletions

View File

@@ -42,6 +42,7 @@ jobs:
           --hidden-import ADC_function.py \
           --hidden-import core.py \
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+          --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
           --add-data "Img:Img" \
           --add-data "config.ini:." \
@@ -53,6 +54,7 @@ jobs:
           --hidden-import ADC_function.py `
           --hidden-import core.py `
           --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+          --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" `
           --add-data "Img;Img" `
           --add-data "config.ini;." `

View File

@@ -14,6 +14,7 @@ from urllib.parse import urljoin
 import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+from cloudscraper import create_scraper

 def getXpathSingle(htmlcode, xpath):
@@ -22,10 +23,10 @@ def getXpathSingle(htmlcode, xpath):
     return result1

-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'

 # core web request function
-def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     verify = config.getInstance().cacert_file()
     configProxy = config.getInstance().proxy()
     errors = ""
@@ -41,13 +42,12 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
         else:
             result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)

-        result.encoding = "utf-8"
         if return_type == "object":
             return result
         elif return_type == "content":
             return result.content
         else:
+            result.encoding = encoding or "utf-8"
             return result.text
     except requests.exceptions.ProxyError:
         print("[-]Proxy error! Please check your Proxy")
@@ -98,59 +98,150 @@ class TimeoutHTTPAdapter(HTTPAdapter):
         kwargs["timeout"] = self.timeout
         return super().send(request, **kwargs)

-def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
+# with keep-alive feature
+def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    configProxy = config.getInstance().proxy()
+    session = requests.Session()
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    if configProxy.enable:
+        session.verify = config.getInstance().cacert_file()
+        session.proxies = configProxy.proxies()
+    headers = {"User-Agent": ua or G_USER_AGENT}
+    session.headers = headers
+    try:
+        if isinstance(url, str) and len(url):
+            result = session.get(str(url))
+        else:  # an empty url returns the reusable session object directly; no return_type needed
+            return session
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "session":
+            return result, session
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_session() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f"[-]get_html_session() failed. {e}")
+    return None

+def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
+        s.verify = config.getInstance().cacert_file()
         s.proxies = configProxy.proxies()
-    browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
-    result = browser.open(url)
-    if not result.ok:
-        return ''
-    result.encoding = "utf-8"
-    if return_type == "object":
-        return result
-    elif return_type == "content":
-        return result.content
-    elif return_type == "browser":
-        return result, browser
-    else:
-        return result.text
+    try:
+        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
+        if isinstance(url, str) and len(url):
+            result = browser.open(url)
+        else:  # an empty url returns the reusable browser object directly
+            return browser
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "browser":
+            return result, browser
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f'[-]get_html_by_browser() Failed! {e}')
+    return None

-def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
     s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
     if configProxy.enable:
+        s.verify = config.getInstance().cacert_file()
         s.proxies = configProxy.proxies()
-    browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
-    result = browser.open(url)
-    if not result.ok:
-        return ''
-    form = browser.select_form() if form_select is None else browser.select_form(form_select)
-    if isinstance(fields, dict):
-        for k, v in fields.items():
-            browser[k] = v
-    response = browser.submit_selected()
-    response.encoding = "utf-8"
-    if return_type == "object":
-        return response
-    elif return_type == "content":
-        return response.content
-    elif return_type == "browser":
-        return response, browser
-    else:
-        return response.text
+    try:
+        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
+        result = browser.open(url)
+        if not result.ok:
+            return None
+        form = browser.select_form() if form_select is None else browser.select_form(form_select)
+        if isinstance(fields, dict):
+            for k, v in fields.items():
+                browser[k] = v
+        response = browser.submit_selected()
+        if return_type == "object":
+            return response
+        elif return_type == "content":
+            return response.content
+        elif return_type == "browser":
+            return response, browser
+        else:
+            response.encoding = encoding or "utf-8"
+            return response.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_form() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f'[-]get_html_by_form() Failed! {e}')
+    return None

+def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    configProxy = config.getInstance().proxy()
+    session = create_scraper(browser={'custom': ua or G_USER_AGENT,})
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    if configProxy.enable:
+        session.verify = config.getInstance().cacert_file()
+        session.proxies = configProxy.proxies()
+    try:
+        if isinstance(url, str) and len(url):
+            result = session.get(str(url))
+        else:  # an empty url returns the reusable scraper object directly; no return_type needed
+            return session
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "scraper":
+            return result, session
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f"[-]get_html_by_scraper() failed. {e}")
+    return None

 # def get_javlib_cookie() -> [dict, str]:
@@ -645,3 +736,37 @@ def file_not_exist_or_empty(filepath) -> bool:
 # simple Japanese-text detection
 def is_japanese(s) -> bool:
     return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
+
+# Usage: python ./ADC_function.py https://cn.bing.com/
+if __name__ == "__main__":
+    import sys, timeit
+    from http.client import HTTPConnection
+
+    def benchmark(t, url):
+        print(f"HTTP GET Benchmark times:{t} url:{url}")
+        tm = timeit.timeit(f"_ = session1.get('{url}')",
+                           "from __main__ import get_html_session;session1=get_html_session()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
+                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = browser1.open('{url}')",
+                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = get_html('{url}')",
+                           "from __main__ import get_html",
+                           number=t)
+        print(f' *{tm:>10.5f}s get_html()')
+
+    t = 100
+    #url = "https://www.189.cn/"
+    url = "http://www.chinaunicom.com"
+    HTTPConnection.debuglevel = 1
+    s = get_html_session()
+    _ = s.get(url)
+    HTTPConnection.debuglevel = 0
+    if len(sys.argv) > 1:
+        url = sys.argv[1]
+    benchmark(t, url)
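A note on the new helpers above: called without a url, each one hands back the underlying keep-alive object itself, which the crawlers below reuse across requests. A usage sketch under that assumption (the example.com URLs are placeholders):

# Usage sketch for the helpers added above; example.com URLs are placeholders.
from ADC_function import get_html_session, get_html_by_scraper

session = get_html_session()        # no url -> returns a reusable requests.Session
page1 = session.get('https://example.com/a').text   # TCP connection kept alive
page2 = session.get('https://example.com/b').text   # ... and reused here

scraper = get_html_by_scraper()     # no url -> returns a reusable cloudscraper session
page3 = scraper.get('https://example.com/c').text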

View File

@@ -7,6 +7,7 @@ import shutil
 import typing
 import urllib3
 import signal
+from opencc import OpenCC

 import config
 from datetime import datetime, timedelta
@@ -377,7 +378,7 @@ def rm_empty_folder(path):
     pass

-def create_data_and_move(file_path: str, zero_op):
+def create_data_and_move(file_path: str, zero_op, oCC):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
     debug = config.getInstance().debug()
     n_number = get_number(debug, os.path.basename(file_path))
@@ -388,7 +389,7 @@ def create_data_and_move(file_path: str, zero_op):
         if zero_op:
             return
         if n_number:
-            core_main(file_path, n_number)
+            core_main(file_path, n_number, oCC)
         else:
             print("[-] number empty ERROR")
             moveFailedFolder(file_path)
@@ -399,7 +400,7 @@ def create_data_and_move(file_path: str, zero_op):
         if zero_op:
             return
         if n_number:
-            core_main(file_path, n_number)
+            core_main(file_path, n_number, oCC)
         else:
             raise ValueError("number empty")
         print("[*]======================================================")
@@ -413,13 +414,13 @@ def create_data_and_move(file_path: str, zero_op):
         print('[!]', err)

-def create_data_and_move_with_custom_number(file_path: str, custom_number):
+def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC):
     conf = config.getInstance()
     file_name = os.path.basename(file_path)
     try:
         print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number))
         if custom_number:
-            core_main(file_path, custom_number)
+            core_main(file_path, custom_number, oCC)
         else:
             print("[-] number empty ERROR")
         print("[*]======================================================")
@@ -488,12 +489,21 @@ def main():
     create_failed_folder(conf.failed_folder())
+    # create the OpenCC converter
+    ccm = conf.cc_convert_mode()
+    try:
+        oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json')
+    except:
+        # some OSes have no OpenCC cpython build; try opencc-python-reimplemented instead:
+        # pip uninstall opencc && pip install opencc-python-reimplemented
+        oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t')
     if not single_file_path == '':  # Single File
         print('[+]==================== Single File =====================')
         if custom_number == '':
-            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
+            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC)
         else:
-            create_data_and_move_with_custom_number(single_file_path, custom_number)
+            create_data_and_move_with_custom_number(single_file_path, custom_number, oCC)
     else:
         folder_path = conf.source_folder()
         if not isinstance(folder_path, str) or folder_path == '':
@@ -515,7 +525,7 @@ def main():
             count = count + 1
             percentage = str(count / int(count_all) * 100)[:4] + '%'
             print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
-            create_data_and_move(movie_path, zero_op)
+            create_data_and_move(movie_path, zero_op, oCC)
             if count >= stop_count:
                 print("[!]Stop counter triggered!")
                 break
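For reference, the two OpenCC variants handled by the try/except above differ only in the config name they accept; a short sketch of the conversion itself, assuming one of the two packages is installed:

# Sketch of the OpenCC fallback handled above: the opencc C-python build
# expects 't2s.json', while opencc-python-reimplemented expects plain 't2s'.
from opencc import OpenCC

try:
    cc = OpenCC('t2s.json')   # opencc (C-python) config naming
except Exception:
    cc = OpenCC('t2s')        # opencc-python-reimplemented config naming

print(cc.convert('漢字'))     # -> '汉字' (Traditional -> Simplified, mode 1)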

View File

@@ -17,6 +17,8 @@ make:
 	@echo "[+]Pyinstaller make"
 	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+		--add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \
+		--add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \
 		--add-data "Img:Img" \
 		--add-data "config.ini:." \

View File

@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool:  # detect a failed metadata fetch
     return True

-def get_data_from_json(file_number):  # return metadata from JSON
+def get_data_from_json(file_number, oCC):  # return metadata from JSON
     """
     iterate through all services and fetch the data
     """
@@ -290,6 +290,20 @@ def get_data_from_json(file_number):  # return metadata from JSON
         if len(t):
             json_data[translate_value] = special_characters_replacement(t)

+    if oCC:
+        cc_vars = conf.cc_convert_vars().split(",")
+        for cc in cc_vars:
+            if cc == "actor":
+                json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
+                json_data['actor'] = oCC.convert(json_data['actor'])
+            elif cc == "tag":
+                json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
+            else:
+                try:
+                    json_data[cc] = oCC.convert(json_data[cc])
+                except:
+                    pass
+
     naming_rule = ""
     for i in conf.naming_rule().split("+"):
         if i not in json_data:
@@ -314,4 +328,6 @@ def special_characters_replacement(text) -> str:
             replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
             replace('‘', '').  # U+2018 LEFT SINGLE QUOTATION MARK
             replace('’', '').  # U+2019 RIGHT SINGLE QUOTATION MARK
-            replace('&', ''))
+            replace('…', '').
+            replace('&', '')
+            )

View File

@@ -6,17 +6,16 @@ import re
 from ADC_function import *
 from WebCrawler.storyline import getStoryline

+G_SITE = 'https://www.caribbeancom.com'
+
 def main(number: str) -> json:
     try:
-        # the actor-photo feature is unused; temporarily commented out and switched to get_html() for speed
-        #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-        #                                 return_type='browser')
-        #if not r.ok:
-        #    raise ValueError("page not found")
-        #htmlcode = str(browser.page)
-        htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
-        htmlcode = htmlbyte.decode('euc-jp')
-        if not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
+        url = f'{G_SITE}/moviepages/{number}/index.html'
+        result, session = get_html_session(url, return_type='session')
+        htmlcode = result.content.decode('euc-jp')
+        if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
             raise ValueError("page not found")

         lx = html.fromstring(htmlcode)
@@ -32,13 +31,13 @@ def main(number: str) -> json:
             'actor': get_actor(lx),
             'release': get_release(lx),
             'number': number,
-            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
             'tag': get_tag(lx),
             'extrafanart': get_extrafanart(lx),
             'label': get_series(lx),
             'imagecut': 1,
-            # 'actor_photo': get_actor_photo(browser),
-            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            # 'actor_photo': get_actor_photo(lx, session),
+            'website': f'{G_SITE}/moviepages/{number}/index.html',
             'source': 'carib.py',
             'series': get_series(lx),
         }
@@ -101,24 +100,25 @@ def get_series(lx: html.HtmlElement) -> str:
     return ''

 def get_runtime(lx: html.HtmlElement) -> str:
-    return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
+    return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()

-def get_actor_photo(browser):
-    htmla = browser.page.select('#moviepages > div > div:nth-child(1) > div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a')
+def get_actor_photo(lx, session):
+    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
+    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
     t = {}
-    for a in htmla:
-        if a.text.strip() == '':
+    for name, a in zip(names, htmla):
+        if name.strip() == '':
             continue
-        p = {a.text.strip(): a['href']}
+        p = {name.strip(): a.attrib['href']}
         t.update(p)
     o = {}
     for k, v in t.items():
         if '/search_act/' not in v:
             continue
-        r = browser.open_relative(v)
+        r = session.get(urljoin(G_SITE, v))
         if not r.ok:
             continue
-        html = browser.page.prettify()
+        html = r.text
         pos = html.find('.full-bg')
         if pos < 0:
             continue
@@ -126,7 +126,7 @@ def get_actor_photo(browser):
         cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
         if not cssBGjpgs or not len(cssBGjpgs[0]):
             continue
-        p = {k: urljoin(browser.url, cssBGjpgs[0])}
+        p = {k: urljoin(r.url, cssBGjpgs[0])}
         o.update(p)
     return o

View File

@@ -5,6 +5,7 @@ from lxml import etree  # need install
 import json
 from ADC_function import *
 from WebCrawler.storyline import getStoryline
+import inspect

 def getActorPhoto(html):
     actors = html.xpath('//div[@class="star-name"]/a')
@@ -60,6 +61,8 @@ def getCID(html):
     result = re.sub('/.*?.jpg', '', string)
     return result

 def getOutline(number, title):  # fetch the storyline; concurrent multi-process queries
+    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
+        return ''  # calls arriving from airav.py skip the outline, avoiding a duplicate fetch that slows processing
     return getStoryline(number, title)

 def getSeriseJa(html):
     x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
@@ -115,8 +118,15 @@ def main_uncensored(number):
 def main(number):
     try:
         try:
+            url = "https://www." + secrets.choice([
+                'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
+                'cdnbus.fun',
+                'dmmbus.fun', 'dmmsee.fun',
+                'fanbus.us',
+                'seedmm.fun',
+            ]) + "/"
             try:
-                htmlcode = get_html('https://www.fanbus.us/' + number)
+                htmlcode = get_html(url + number)
             except:
                 htmlcode = get_html('https://www.javbus.com/' + number)
             if "<title>404 Page Not Found" in htmlcode:

View File

@@ -4,7 +4,6 @@ import re
 from lxml import etree
 import json
 from ADC_function import *
-from mechanicalsoup.stateful_browser import StatefulBrowser
 from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
@@ -30,8 +29,8 @@ def getActor(html):
         idx = idx + 1
     return r

-def getaphoto(url, browser):
-    html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
+def getaphoto(url, session):
+    html_page = session.get(url).text if session is not None else get_html(url)
     img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)')
     img_url = img_prether.findall(html_page)
     if img_url:
@@ -39,7 +38,7 @@ def getaphoto(url, browser):
     else:
         return ''

-def getActorPhoto(html, javdb_site, browser):  #//*[@id="star_qdt"]/li/a/img
+def getActorPhoto(html, javdb_site, session):  #//*[@id="star_qdt"]/li/a/img
     actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
     if not actorall:
         return {}
@@ -47,7 +46,7 @@ def getActorPhoto(html, javdb_site, browser):  #//*[@id="star_qdt"]/li/a/img
     actor_photo = {}
     for i in actorall:
         if i.text in a:
-            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
+            actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
     return actor_photo

 def getStudio(a, html):
@@ -178,15 +177,6 @@ def getDirector(html):
     result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
     result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
     return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')

-def getOutline0(number):  # fetch the storyline; the airav.wiki site 404s, renamed for now, delete if it never recovers
-    try:
-        htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-        from WebCrawler.airav import getOutline as airav_getOutline
-        result = airav_getOutline(htmlcode)
-        return result
-    except:
-        pass
-    return ''

 def getOutline(number, title):  # fetch the storyline; concurrent multi-process queries
     return getStoryline(number, title)

 def getSeries(html):
@@ -224,15 +214,22 @@ def main(number):
     javdb_site = secrets.choice(javdb_sites)
     if debug:
         print(f'[!]javdb:select site {javdb_site}')
-    browser = None
+    session = None
+    javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
     try:
-        javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
-        res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser')
-        if not res.ok:
+        if debug:
+            raise  # try the get_html_by_scraper() branch
+        res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
+        if not res:
             raise
         query_result = res.text
     except:
-        query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies)
+        res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
+        if not res:
+            raise ValueError('page not found')
+        query_result = res.text
+    if session is None:
+        raise ValueError('page not found')
     html = etree.fromstring(query_result, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
     # javdb sometimes returns multiple results,
     # and the first element may not be the one we are looking for
@@ -251,13 +248,12 @@ def main(number):
             raise ValueError("number not found")
         correct_url = urls[0]
     try:
-        if isinstance(browser, StatefulBrowser):  # benefit from http keep-alive for speed
-            detail_page = browser.open_relative(correct_url).text
-        else:
-            javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url
-            detail_page = get_html(javdb_detail_url, cookies=javdb_cookies)
+        # benefit from http keep-alive for speed
+        javdb_detail_url = urljoin(res.url, correct_url)
+        detail_page = session.get(javdb_detail_url).text
     except:
         detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
+        session = None

     # etree.fromstring is expensive, call it at most once; its xpath is fast (faster than bs4 find/select), so use it freely
     lx = etree.fromstring(detail_page, etree.HTMLParser())
@@ -303,8 +299,8 @@ def main(number):
             'tag': getTag(lx),
             'label': getLabel(lx),
             'year': getYear(detail_page),  # str(re.search('\d{4}',getRelease(a)).group()),
-            # 'actor_photo': getActorPhoto(lx, javdb_site, browser),
-            'website': 'https://javdb.com' + correct_url,
+            # 'actor_photo': getActorPhoto(lx, javdb_site, session),
+            'website': urljoin('https://javdb.com', correct_url),
             'source': 'javdb.py',
             'series': getSeries(lx),
@@ -318,7 +314,7 @@ def main(number):
     except Exception as e:
-        if config.getInstance().debug():
+        if debug:
             print(e)
         dic = {"title": ""}
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -333,12 +329,12 @@ if __name__ == "__main__":
     # print(main('BANK-022'))
     # print(main('070116-197'))
     # print(main('093021_539'))  # no stills; studio pacopacomama
-    # print(main('FC2-2278260'))
+    print(main('FC2-2278260'))
     # print(main('FC2-735670'))
     # print(main('FC2-1174949'))  # not found
     print(main('MVSD-439'))
     # print(main('EHM0001'))  # not found
-    # print(main('FC2-2314275'))
+    print(main('FC2-2314275'))
     # print(main('EBOD-646'))
     # print(main('LOVE-262'))
     print(main('ABP-890'))
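The search flow above now cascades: keep-alive session first, cloudscraper second, and the returned session is reused for the detail page (or dropped on failure). A condensed sketch of that cascade, using the helper names from this diff (the wrapper function itself is hypothetical):

# Condensed sketch of the fetch cascade used above; helper names come from
# this diff, while fetch_query() itself is only illustrative.
def fetch_query(javdb_url, javdb_cookies):
    try:
        res, session = get_html_session(javdb_url, cookies=javdb_cookies,
                                        return_type='session')
        if not res:
            raise ValueError('session fetch failed')
    except Exception:
        res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies,
                                           return_type='scraper')
        if not res:
            raise ValueError('page not found')
    return res.text, session   # session is reused for the detail page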

View File

@@ -4,13 +4,14 @@ import re
 import json
 import builtins
 from ADC_function import *
+from lxml.html import fromstring
 from multiprocessing import Pool
 from multiprocessing.dummy import Pool as ThreadPool
 from difflib import SequenceMatcher
 from unicodedata import category
 from number_parser import is_uncensored

-G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
+G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}

 G_mode_txt = ('顺序执行','线程池','进程池')
@@ -27,6 +28,8 @@ class noThread(object):
 def getStoryline(number, title, sites: list=None):
     start_time = time.time()
     conf = config.getInstance()
+    if not conf.is_storyline():
+        return ''
     debug = conf.debug() or conf.storyline_show() == 2
     storyine_sites = conf.storyline_site().split(',') if sites is None else sites
     if is_uncensored(number):
@@ -49,82 +52,87 @@ def getStoryline(number, title, sites: list=None):
     run_mode = conf.storyline_mode()
     assert run_mode in (0,1,2)
     with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
-        result = pool.map(getStoryline_mp, mp_args)
+        results = pool.map(getStoryline_mp, mp_args)
     if not debug and conf.storyline_show() == 0:
-        for value in result:
+        for value in results:
             if isinstance(value, str) and len(value):
                 return value
         return ''
     # the debug output below is written to the log; output from process-pool workers is not, it only shows on stdout
-    cnt = len(apply_sites)
-    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
+    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
     first = True
     sel = ''
-    for i in range(cnt):
-        sl = len(result[i]) if isinstance(result[i], str) else 0
+    for site, desc in zip(apply_sites, results):
+        sl = len(desc) if isinstance(desc, str) else 0
         if sl and first:
-            s += f'[选中{apply_sites[i]}字数:{sl}]'
+            s += f'[选中{site}字数:{sl}]'
             first = False
-            sel = result[i]
+            sel = desc
         elif sl:
-            s += f'{apply_sites[i]}字数:{sl}'
+            s += f'{site}字数:{sl}'
         else:
-            s += f'{apply_sites[i]}:空'
+            s += f'{site}:空'
     print(s)
     return sel

 def getStoryline_mp(args):
-    return _getStoryline_mp(*args)
-
-# note: print() from a new process is not written to the log; to debug a broken data source later, watch stdout directly, and attach a screenshot for issues
-def _getStoryline_mp(site, number, title, debug):
-    start_time = time.time()
-    storyline = None
-    if not isinstance(site, str):
-        return storyline
-    elif site == "airav":
-        storyline = getStoryline_airav(number, debug)
-    elif site == "avno1":
-        storyline = getStoryline_avno1(number, debug)
-    elif site == "xcity":
-        storyline = getStoryline_xcity(number, debug)
-    elif site == "amazon":
-        storyline = getStoryline_amazon(title, number, debug)
-    elif site == "58avgo":
-        storyline = getStoryline_58avgo(number, debug)
-    if not debug:
-        return storyline
-    print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
-        site,
-        time.time() - start_time,
-        time.strftime("%H:%M:%S"),
-        storyline if isinstance(storyline, str) and len(storyline) else '[空]')
-    )
-    return storyline
+    def _inner(site, number, title, debug):
+        start_time = time.time()
+        storyline = None
+        if not isinstance(site, str):
+            return storyline
+        elif site == "airavwiki":
+            storyline = getStoryline_airavwiki(number, debug)
+        elif site == "airav":
+            storyline = getStoryline_airav(number, debug)
+        elif site == "avno1":
+            storyline = getStoryline_avno1(number, debug)
+        elif site == "xcity":
+            storyline = getStoryline_xcity(number, debug)
+        elif site == "amazon":
+            storyline = getStoryline_amazon(title, number, debug)
+        elif site == "58avgo":
+            storyline = getStoryline_58avgo(number, debug)
+        if not debug:
+            return storyline
+        # print() from getStoryline_*() in a process-pool worker is not written to the log; thread pool and sequential modes are unaffected
+        print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
+            site,
+            time.time() - start_time,
+            time.strftime("%H:%M:%S"),
+            storyline if isinstance(storyline, str) and len(storyline) else '[空]')
+        )
+        return storyline
+    return _inner(*args)
 def getStoryline_airav(number, debug):
     try:
-        number_up = number
         site = secrets.choice(('airav.cc','airav4.club'))
         url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
-        res, browser = get_html_by_browser(url, return_type='browser')
-        if not res.ok:
-            raise ValueError(f"get_html_by_browser('{url}') failed")
-        avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
-        if number_up not in avs.select_one('a > h3').text.upper():
-            raise ValueError("number not found")
-        detail_url = avs.select_one('a')['href']
-        res = browser.open_relative(detail_url)
+        res, session = get_html_session(url, return_type='session')
+        if not res:
+            raise ValueError(f"get_html_session('{url}') failed")
+        lx = fromstring(res.text)
+        urls = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/@href')
+        txts = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/h3[@class="one_name ga_name"]/text()')
+        detail_url = None
+        for txt, url in zip(txts, urls):
+            if re.search(number, txt, re.I):
+                detail_url = urljoin(res.url, url)
+                break
+        if detail_url is None:
+            raise ValueError("number not found")
+        res = session.get(detail_url)
         if not res.ok:
-            raise ValueError(f"browser.open_relative('{detail_url}') failed")
-        t = browser.page.select_one('head > title').text
-        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
-        if number.upper() != airav_number:
+            raise ValueError(f"session.get('{detail_url}') failed")
+        lx = fromstring(res.text)
+        t = str(lx.xpath('/html/head/title/text()')[0]).strip()
+        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0])
+        if not re.search(number, airav_number, re.I):
             raise ValueError(f"page number ->[{airav_number}] not match")
-        desc = browser.page.select_one('li.introduction > span').text.strip()
+        desc = str(lx.xpath('//span[@id="ContentPlaceHolder1_Label2"]/text()')[0]).strip()
         return desc
     except Exception as e:
         if debug:
@@ -133,6 +141,43 @@ def getStoryline_airav(number, debug):
     return None

+def getStoryline_airavwiki(number, debug):
+    try:
+        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+        url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}'
+        result, session = get_html_session(url, return_type='session')
+        if not result:
+            raise ValueError(f"get_html_session('{url}','{number}') failed")
+        j = json.loads(result.content)
+        if int(j.get('count')) == 0:
+            raise ValueError("number not found")
+        link = None
+        for r in j["result"]:
+            n = r['barcode']
+            if re.search(number, n, re.I):
+                link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW')
+                break
+        if link is None:
+            raise ValueError("number not found")
+        result = session.get(link)
+        if not result.ok or not re.search(number, result.url, re.I):
+            raise ValueError("detail page not found")
+        j = json.loads(result.content)
+        if int(j.get('count')) != 1:
+            raise ValueError("number not found")
+        detail_number = j["result"]['barcode']
+        if not re.search(number, detail_number, re.I):
+            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
+        desc = j["result"]['description']
+        return desc
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].")
+        pass
+    return ''

 def getStoryline_58avgo(number, debug):
     try:
         url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
@@ -143,27 +188,27 @@ def getStoryline_58avgo(number, debug):
         result, browser = get_html_by_form(url,
                                            fields={'ctl00$TextBox_SearchKeyWord': kwd},
                                            return_type='browser')
-        if not result.ok:
+        if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
         if f'searchresults.aspx?Search={kwd}' not in browser.url:
             raise ValueError("number not found")
         s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
         link = None
-        for i in range(len(s)):
-            title = s[i].h3.text.strip()
+        for a in s:
+            title = a.h3.text.strip()
             if re.search(number, title, re.I):
-                link = s[i]
+                link = a
                 break
         if link is None:
             raise ValueError("number not found")
         result = browser.follow_link(link)
         if not result.ok or 'playon.aspx' not in browser.url:
             raise ValueError("detail page not found")
-        title = browser.page.select('head > title')[0].text.strip()
+        title = browser.page.select_one('head > title').text.strip()
         detail_number = str(re.findall('\[(.*?)]', title)[0])
         if not re.search(number, detail_number, re.I):
             raise ValueError(f"detail page number not match, got ->[{detail_number}]")
-        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+        return browser.page.select_one('#ContentPlaceHolder1_Label2').text.strip()
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
@@ -172,6 +217,29 @@ def getStoryline_58avgo(number, debug):
 def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
+    try:
+        site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
+            'hotav.biz','iqq2.xyz','javhq.tv',
+            'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
+        url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
+        lx = fromstring(get_html_by_scraper(url))
+        descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
+        titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
+        if not descs or not len(descs):
+            raise ValueError("number not found")
+        for title, desc in zip(titles, descs):
+            page_number = title[title.rfind(' '):].strip()
+            if re.search(number, page_number, re.I):
+                return desc.strip()
+        raise ValueError(f"page number ->[{page_number}] not match")
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
+        pass
+    return ''
+
+def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
@@ -181,14 +249,14 @@ def getStoryline_avno1(number, debug):
             form_select='div.wrapper > div.header > div.search > form',
             fields={'kw': number},
             return_type='browser')
-        if not result.ok:
+        if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
         s = browser.page.select('div.type_movie > div > ul > li > div')
-        for i in range(len(s)):
-            title = s[i].a.h3.text.strip()
+        for div in s:
+            title = div.a.h3.text.strip()
             page_number = title[title.rfind(' '):].strip()
             if re.search(number, page_number, re.I):
-                return s[i]['data-description'].strip()
+                return div['data-description'].strip()
         raise ValueError(f"page number ->[{page_number}] not match")
     except Exception as e:
         if debug:
@@ -221,41 +289,45 @@ def getStoryline_amazon(q_title, number, debug):
     if not isinstance(q_title, str) or not len(q_title):
         return None
     try:
-        amazon_cookie, _ = load_cookies('amazon.json')
-        cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
+        cookie, cookies_filepath = load_cookies('amazon.json')
         url = "https://www.amazon.co.jp/s?k=" + q_title
-        res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
-        if not res.ok:
-            raise ValueError("get_html_by_browser() failed")
-        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
-        if isinstance(lks, list) and len(lks):
-            browser.follow_link(lks[0])
+        res, session = get_html_session(url, cookies=cookie, return_type='session')
+        if not res:
+            raise ValueError("get_html_session() failed")
+        lx = fromstring(res.text)
+        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
+        if len(lks) and lks[0].startswith('/'):
+            res = session.get(urljoin(res.url, lks[0]))
             cookie = None
-        html = etree.fromstring(str(browser.page), etree.HTMLParser())
-        titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
-        urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
+        lx = fromstring(res.text)
+        titles = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
+        urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
         if not len(urls) or len(urls) != len(titles):
             raise ValueError("titles not found")
         idx = amazon_select_one(titles, q_title, number, debug)
         if not isinstance(idx, int) or idx < 0:
             raise ValueError("title and number not found")
-        furl = urls[idx]
-        r = browser.open_relative(furl)
-        if not r.ok:
-            raise ValueError("browser.open_relative() failed.")
-        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
-        if isinstance(lks, list) and len(lks):
-            browser.follow_link(lks[0])
+        furl = urljoin(res.url, urls[idx])
+        res = session.get(furl)
+        if not res.ok:
+            raise ValueError("session.get() failed.")
+        lx = fromstring(res.text)
+        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
+        if len(lks) and lks[0].startswith('/'):
+            res = session.get(urljoin(res.url, lks[0]))
             cookie = None
-        ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
-        ama_t = re.sub(r'審査番号:\d+', '', ama_t)
+        lx = fromstring(res.text)
+        div = lx.xpath('//*[@id="productDescription"]')[0]
+        ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)])
+        ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
         if cookie is None:
+            # delete the invalid cookies file (whether user-created or auto-created) to avoid persistent failures
+            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
             # the auto-created cookies file goes at the end of the search-path list (lowest priority); users with an amazon.co.jp account can export cookies from a browser into an earlier search path
             ama_save = Path.home() / ".local/share/avdc/amazon.json"
             ama_save.parent.mkdir(parents=True, exist_ok=True)
-            ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+            ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
         return ama_t
@@ -270,32 +342,31 @@ def amazon_select_one(a_titles, q_title, number, debug):
     sel = -1
     ratio = 0
     que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
-    for loc in range(len(a_titles)):
-        t = a_titles[loc]
-        if re.search(number, t, re.I):  # titles mostly lack the ID number, but a few carry it; pass immediately on a match
-            return loc
-        if not re.search('DVD|Blu-ray', t, re.I):
+    for tloc, title in enumerate(a_titles):
+        if re.search(number, title, re.I):  # titles mostly lack the ID number, but a few carry it; pass immediately on a match
+            return tloc
+        if not re.search('DVD|Blu-ray', title, re.I):
             continue
-        ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I))
+        ama_t = str(re.sub('DVD|Blu-ray', "", title, re.I))
         ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
         findlen = 0
         lastpos = -1
-        cnt = len(ama_t)
-        for c in reversed(ama_t):
-            cnt -= 1
-            pos = que_t.rfind(c)
+        for cloc, char in reversed(tuple(enumerate(ama_t))):
+            pos = que_t.rfind(char)
             if lastpos >= 0:
-                pos_near = que_t[:lastpos].rfind(c)
+                pos_near = que_t[:lastpos].rfind(char)
                 if pos_near < 0:
                     findlen = 0
                     lastpos = -1
-                    ama_t = ama_t[:cnt+1]
+                    ama_t = ama_t[:cloc+1]
                 else:
                     pos = pos_near
             if pos < 0:
-                if category(c) == 'Nd':
+                if category(char) == 'Nd':
                     return -1
-                ama_t = ama_t[:cnt]
+                if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
+                    return -1
+                ama_t = ama_t[:cloc]
                 findlen = 0
                 lastpos = -1
                 continue
@@ -311,7 +382,7 @@ def amazon_select_one(a_titles, q_title, number, debug):
             return -1
     r = SequenceMatcher(None, ama_t, que_t).ratio()
     if r > ratio:
-        sel = loc
+        sel = tloc
         ratio = r
         save_t_ = ama_t
     if ratio > 0.999:
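getStoryline_airavwiki() above is the data source this PR adds: one JSON search call, a barcode match against the requested number, then one per-barcode detail call. A bare-requests sketch of the same two endpoints (copied from the diff; error handling and the shared-session plumbing omitted):

# Bare-requests sketch of the two airav.wiki API calls used by
# getStoryline_airavwiki() above; endpoints come from this diff.
import re
import requests

def airavwiki_description(number: str) -> str:
    base = 'https://www.airav.wiki'
    r = requests.get(f'{base}/api/video/list',
                     params={'lang': 'zh-TW', 'lng': 'zh-TW', 'search': number})
    hits = [v['barcode'] for v in r.json().get('result', [])
            if re.search(number, v['barcode'], re.I)]
    if not hits:
        return ''
    r = requests.get(f'{base}/api/video/barcode/{hits[0]}', params={'lng': 'zh-TW'})
    return r.json()['result']['description']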

View File

@@ -83,24 +83,29 @@ water=2
 ; extrafanart (movie stills)
 [extrafanart]
-switch=0
+switch=1
 parallel_download=5
 extrafanart_folder=extrafanart

 ; storyline
 [storyline]
+switch=1
 ; when website is javbus javdb avsox xcity carib, the site / censored_site / uncensored_site lists below select the
 ; data sources for the storyline. Listed sites are queried concurrently; the number before the colon sets the priority
 ; (ascending), and a site's result is only used when every lower-numbered site returned nothing.
-; airav avno1 58avgo return Chinese storylines; airav covers censored titles only, avno1 covers both censored and
-; uncensored, 58avgo covers only uncensored or leaked/decensored titles (this feature is unused).
+; airavwiki airav avno1 58avgo return Chinese storylines; airav covers censored titles only, avno1 and airavwiki
+; cover both censored and uncensored, 58avgo covers only uncensored or leaked/decensored titles (this feature is unused).
 ; xcity and amazon are Japanese; since amazon listings carry no ID number, picking the matching DVD is only 99.6%
 ; accurate. If all three lists are empty, no lookup is performed; disabling the lookup greatly speeds up scraping.
 ; site=
-site=3:avno1
-censored_site=1:airav,4:xcity,5:amazon
-uncensored_site=2:58avgo
+site=1:avno1,4:airavwiki
+censored_site=2:airav,5:xcity,6:amazon
+uncensored_site=3:58avgo
 ; run mode 0: sequential (slowest) 1: thread pool (default) 2: process pool (higher startup cost than the thread pool; the more concurrent sites, the faster)
 run_mode=1
 ; show_result storyline debug info 0 off 1 brief 2 verbose (the verbose part is not logged); turn on 2 to diagnose a dead storyline source
 show_result=0
+
+; Traditional/Simplified Chinese conversion. mode=0: no conversion 1: Traditional->Simplified 2: Simplified->Traditional
+[cc_convert]
+mode=1
+vars=actor,director,label,outline,series,studio,tag,title
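The site lists now carry a numeric priority before the colon; lower numbers win when several concurrently queried sites return data. For illustration only, one way such a list could be parsed and ordered (this exact helper does not appear in the PR):

# Illustration only: parsing the "priority:site" lists introduced above.
def parse_sites(spec: str):
    # "1:avno1,4:airavwiki" -> ['avno1', 'airavwiki'] ordered by priority
    items = []
    for token in spec.split(','):
        prio, _, name = token.partition(':')
        items.append((int(prio), name) if name else (0, prio))
    return [name for _, name in sorted(items)]

print(parse_sites('1:avno1,4:airavwiki'))        # ['avno1', 'airavwiki']
print(parse_sites('2:airav,5:xcity,6:amazon'))   # ['airav', 'xcity', 'amazon']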

View File

@@ -246,23 +246,29 @@ class Config:
     def debug(self) -> bool:
         return self.getboolean_override("debug_mode", "switch")

+    def is_storyline(self) -> bool:
+        try:
+            return self.conf.getboolean("storyline", "switch")
+        except:
+            return True
+
     def storyline_site(self) -> str:
         try:
             return self.conf.get("storyline", "site")
         except:
-            return "avno1"
+            return "1:avno1,4:airavwiki"

     def storyline_censored_site(self) -> str:
         try:
             return self.conf.get("storyline", "censored_site")
         except:
-            return "airav,xcity,amazon"
+            return "2:airav,5:xcity,6:amazon"

     def storyline_uncensored_site(self) -> str:
         try:
             return self.conf.get("storyline", "uncensored_site")
         except:
-            return "58avgo"
+            return "3:58avgo"

     def storyline_show(self) -> int:
         try:
@@ -278,6 +284,19 @@ class Config:
         except:
             return 1

+    def cc_convert_mode(self) -> int:
+        try:
+            v = self.conf.getint("cc_convert", "mode")
+            return v if v in (0, 1, 2) else 2 if v > 2 else 0
+        except:
+            return 1
+
+    def cc_convert_vars(self) -> str:
+        try:
+            return self.conf.get("cc_convert", "vars")
+        except:
+            return "actor,director,label,outline,series,studio,tag,title"
+
     @staticmethod
     def _exit(sec: str) -> None:
         print("[-] Read config error! Please check the {} section in config.ini".format(sec))
@@ -374,11 +393,18 @@ class Config:
         sec14 = "storyline"
         conf.add_section(sec14)
-        conf.set(sec14, "site", "avno1")
-        conf.set(sec14, "censored_site", "airav,xcity,amazon")
-        conf.set(sec14, "uncensored_site", "58avgo")
+        conf.set(sec14, "switch", 1)
+        conf.set(sec14, "site", "1:avno1,4:airavwiki")
+        conf.set(sec14, "censored_site", "2:airav,5:xcity,6:amazon")
+        conf.set(sec14, "uncensored_site", "3:58avgo")
         conf.set(sec14, "show_result", 0)
         conf.set(sec14, "run_mode", 1)
+        conf.set(sec14, "cc_convert", 1)
+
+        sec15 = "cc_convert"
+        conf.add_section(sec15)
+        conf.set(sec15, "mode", 1)
+        conf.set(sec15, "vars", "actor,director,label,outline,series,studio,tag,title")

         return conf

core.py
View File

@@ -217,14 +217,12 @@ def extrafanart_download_one_by_one(data, path, filepath):
     print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')

 def download_one_file(args):
-    return _download_one_file(*args)
-
-def _download_one_file(url: str, save_path: Path):
-    filebytes = get_html(url, return_type='content')
-    if isinstance(filebytes, bytes) and len(filebytes):
-        if len(filebytes) == save_path.open('wb').write(filebytes):
-            return str(save_path)
-    return None
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)

 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
@@ -232,11 +230,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
     mp_args = []
-    for i in range(len(url_list)):
-        jpg_fullpath = extrafanart_dir / f'extrafanart-{i+1}.jpg'
+    for i, url in enumerate(url_list, start=1):
+        jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url_list[i], jpg_fullpath))
+        mp_args.append((url, jpg_fullpath))
     if not len(mp_args):
         return
     extrafanart_dir.mkdir(parents=True, exist_ok=True)
@@ -246,11 +244,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     with ThreadPoolExecutor(parallel) as pool:
         result = list(pool.map(download_one_file, mp_args))
     failed = 0
-    for i in range(len(result)):
-        if not result[i]:
-            print(f'[-]Extrafanart {i+1} for [{number}] download failed!')
+    for i, r in enumerate(result, start=1):
+        if not r:
             failed += 1
-    if not all(result):  # non-fatal: the movie is not moved to the failed folder; run mode 3 can fill in the gaps later
+            print(f'[-]Extrafanart {i} for [{number}] download failed!')
+    if failed:  # non-fatal: the movie is not moved to the failed folder; run mode 3 can fill in the gaps later
         print(f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.")
     else:
         print(f"[+]Successfully downloaded {len(result)} extrafanart to '{extrafanart_dir}'")
@@ -574,7 +572,7 @@ def debug_print(data: json):
         pass

-def core_main(file_path, number_th):
+def core_main(file_path, number_th, oCC):
     conf = config.getInstance()
     # ======================================================================= initialize needed variables
     multi_part = 0
@@ -589,7 +587,7 @@ def core_main(file_path, number_th):
     # the commented-out variables below are not needed
     #rootpath= os.getcwd
     number = number_th
-    json_data = get_data_from_json(number)  # fetch metadata for the ID number
+    json_data = get_data_from_json(number, oCC)  # fetch metadata for the ID number
     # Return if blank dict returned (data not found)
     if not json_data:
View File

@@ -2,6 +2,7 @@
 # Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force
 $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
+$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1)

 mkdir build
 mkdir __pycache__
@@ -10,6 +11,7 @@ pyinstaller --onefile AV_Data_Capture.py `
     --hidden-import ADC_function.py `
     --hidden-import core.py `
     --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
+    --add-data "$OPENCC_PATH;opencc" `
     --add-data "Img;Img" `
     --add-data "config.ini;." `

View File

@@ -8,3 +8,4 @@ pysocks==1.7.1
 urllib3==1.24.3
 certifi==2020.12.5
 MechanicalSoup==1.1.0
+opencc==1.1.1

View File

@@ -2,6 +2,7 @@ pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscra
 pip install pyquery pyinstaller
 pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
     --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "$(python3.8 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \

View File

@@ -14,6 +14,7 @@ pip3 install -r requirements.txt
 pip3 install cloudscraper==1.2.52
 pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
     --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \
     --add-data "Img:Img" \
     --add-data "config.ini:." \