diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 289c88e..56e0068 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -42,6 +42,7 @@ jobs: --hidden-import ADC_function.py \ --hidden-import core.py \ --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." \ @@ -53,6 +54,7 @@ jobs: --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" ` + --add-data "$(python -c 'import opencc as _; print(_.__path__[0])' | tail -n 1);opencc" ` --add-data "Img;Img" ` --add-data "config.ini;." ` diff --git a/ADC_function.py b/ADC_function.py index 4da8909..2198939 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -14,6 +14,7 @@ from urllib.parse import urljoin import mechanicalsoup from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry +from cloudscraper import create_scraper def getXpathSingle(htmlcode, xpath): @@ -22,10 +23,10 @@ def getXpathSingle(htmlcode, xpath): return result1 -G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36' +G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36' # 网页请求核心 -def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None): +def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None): verify = config.getInstance().cacert_file() configProxy = config.getInstance().proxy() errors = "" @@ -41,13 +42,12 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None) else: result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies) - result.encoding = "utf-8" - if return_type == "object": return result elif return_type == "content": return result.content else: + result.encoding = encoding or "utf-8" return result.text except requests.exceptions.ProxyError: print("[-]Proxy error! 
Please check your Proxy") @@ -98,59 +98,150 @@ class TimeoutHTTPAdapter(HTTPAdapter): kwargs["timeout"] = self.timeout return super().send(request, **kwargs) -def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): + +# with keep-alive feature +def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None): + configProxy = config.getInstance().proxy() + session = requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(session.cookies, cookies) + retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + if configProxy.enable: + session.verify = config.getInstance().cacert_file() + session.proxies = configProxy.proxies() + headers = {"User-Agent": ua or G_USER_AGENT} + session.headers = headers + try: + if isinstance(url, str) and len(url): + result = session.get(str(url)) + else: # 空url参数直接返回可重用session对象,无需设置return_type + return session + if not result.ok: + return None + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "session": + return result, session + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_session() Proxy error! Please check your Proxy") + except Exception as e: + print(f"[-]get_html_session() failed. {e}") + return None + + +def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None): configProxy = config.getInstance().proxy() s = requests.Session() if isinstance(cookies, dict) and len(cookies): requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) if configProxy.enable: + s.verify = config.getInstance().cacert_file() s.proxies = configProxy.proxies() - browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) - result = browser.open(url) - if not result.ok: - return '' - result.encoding = "utf-8" - if return_type == "object": - return result - elif return_type == "content": - return result.content - elif return_type == "browser": - return result, browser - else: - return result.text + try: + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) + if isinstance(url, str) and len(url): + result = browser.open(url) + else: + return browser + if not result.ok: + return None + + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "browser": + return result, browser + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_browser() Proxy error! Please check your Proxy") + except Exception as e: + print(f'[-]get_html_by_browser() Failed! 
{e}') + return None -def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): +def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None): configProxy = config.getInstance().proxy() s = requests.Session() if isinstance(cookies, dict) and len(cookies): requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) if configProxy.enable: + s.verify = config.getInstance().cacert_file() s.proxies = configProxy.proxies() - browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) - result = browser.open(url) - if not result.ok: - return '' - form = browser.select_form() if form_select is None else browser.select_form(form_select) - if isinstance(fields, dict): - for k, v in fields.items(): - browser[k] = v - response = browser.submit_selected() - response.encoding = "utf-8" - if return_type == "object": - return response - elif return_type == "content": - return response.content - elif return_type == "browser": - return response, browser - else: - return response.text + try: + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) + result = browser.open(url) + if not result.ok: + return None + form = browser.select_form() if form_select is None else browser.select_form(form_select) + if isinstance(fields, dict): + for k, v in fields.items(): + browser[k] = v + response = browser.submit_selected() + + if return_type == "object": + return response + elif return_type == "content": + return response.content + elif return_type == "browser": + return response, browser + else: + result.encoding = encoding or "utf-8" + return response.text + except requests.exceptions.ProxyError: + print("[-]get_html_by_form() Proxy error! Please check your Proxy") + except Exception as e: + print(f'[-]get_html_by_form() Failed! 
{e}') + return None + + +def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None): + configProxy = config.getInstance().proxy() + session = create_scraper(browser={'custom': ua or G_USER_AGENT,}) + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(session.cookies, cookies) + retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + if configProxy.enable: + session.verify = config.getInstance().cacert_file() + session.proxies = configProxy.proxies() + try: + if isinstance(url, str) and len(url): + result = session.get(str(url)) + else: # 空url参数直接返回可重用scraper对象,无需设置return_type + return session + if not result.ok: + return None + if return_type == "object": + return result + elif return_type == "content": + return result.content + elif return_type == "scraper": + return result, session + else: + result.encoding = encoding or "utf-8" + return result.text + except requests.exceptions.ProxyError: + print("[-]get_html_session() Proxy error! Please check your Proxy") + except Exception as e: + print(f"[-]get_html_session() failed. {e}") + return None # def get_javlib_cookie() -> [dict, str]: @@ -645,3 +736,37 @@ def file_not_exist_or_empty(filepath) -> bool: # 日语简单检测 def is_japanese(s) -> bool: return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE)) + + +# Usage: python ./ADC_function.py https://cn.bing.com/ +if __name__ == "__main__": + import sys, timeit + from http.client import HTTPConnection + def benchmark(t, url): + print(f"HTTP GET Benchmark times:{t} url:{url}") + tm = timeit.timeit(f"_ = session1.get('{url}')", + "from __main__ import get_html_session;session1=get_html_session()", + number=t) + print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable') + tm = timeit.timeit(f"_ = scraper1.get('{url}')", + "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()", + number=t) + print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable') + tm = timeit.timeit(f"_ = browser1.open('{url}')", + "from __main__ import get_html_by_browser;browser1=get_html_by_browser()", + number=t) + print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable') + tm = timeit.timeit(f"_ = get_html('{url}')", + "from __main__ import get_html", + number=t) + print(f' *{tm:>10.5f}s get_html()') + t = 100 + #url = "https://www.189.cn/" + url = "http://www.chinaunicom.com" + HTTPConnection.debuglevel = 1 + s = get_html_session() + _ = s.get(url) + HTTPConnection.debuglevel = 0 + if len(sys.argv)>1: + url = sys.argv[1] + benchmark(t, url) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index d08b5e9..afc56ea 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -7,6 +7,7 @@ import shutil import typing import urllib3 import signal +from opencc import OpenCC import config from datetime import datetime, timedelta @@ -377,7 +378,7 @@ def rm_empty_folder(path): pass -def create_data_and_move(file_path: str, zero_op): +def create_data_and_move(file_path: str, zero_op, oCC): # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4 debug = config.getInstance().debug() n_number = get_number(debug, os.path.basename(file_path)) @@ -388,7 +389,7 @@ def create_data_and_move(file_path: str, zero_op): if 
zero_op: return if n_number: - core_main(file_path, n_number) + core_main(file_path, n_number, oCC) else: print("[-] number empty ERROR") moveFailedFolder(file_path) @@ -399,7 +400,7 @@ def create_data_and_move(file_path: str, zero_op): if zero_op: return if n_number: - core_main(file_path, n_number) + core_main(file_path, n_number, oCC) else: raise ValueError("number empty") print("[*]======================================================") @@ -413,13 +414,13 @@ def create_data_and_move(file_path: str, zero_op): print('[!]', err) -def create_data_and_move_with_custom_number(file_path: str, custom_number): +def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC): conf = config.getInstance() file_name = os.path.basename(file_path) try: print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number)) if custom_number: - core_main(file_path, custom_number) + core_main(file_path, custom_number, oCC) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -488,12 +489,21 @@ def main(): create_failed_folder(conf.failed_folder()) + # create OpenCC converter + ccm = conf.cc_convert_mode() + try: + oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json') + except: + # some OS no OpennCC cpython, try opencc-python-reimplemented. + # pip uninstall opencc && pip install opencc-python-reimplemented + oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t') + if not single_file_path == '': #Single File print('[+]==================== Single File =====================') if custom_number == '': - create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path))) + create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)), oCC) else: - create_data_and_move_with_custom_number(single_file_path, custom_number) + create_data_and_move_with_custom_number(single_file_path, custom_number, oCC) else: folder_path = conf.source_folder() if not isinstance(folder_path, str) or folder_path == '': @@ -515,7 +525,7 @@ def main(): count = count + 1 percentage = str(count / int(count_all) * 100)[:4] + '%' print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S"))) - create_data_and_move(movie_path, zero_op) + create_data_and_move(movie_path, zero_op, oCC) if count >= stop_count: print("[!]Stop counter triggered!") break diff --git a/Makefile b/Makefile index 4c8960a..fdc4e2a 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,8 @@ make: @echo "[+]Pyinstaller make" pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \ + --add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." 
\ diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py index 8a1d67f..275a86a 100644 --- a/WebCrawler/__init__.py +++ b/WebCrawler/__init__.py @@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测 return True -def get_data_from_json(file_number): # 从JSON返回元数据 +def get_data_from_json(file_number, oCC): # 从JSON返回元数据 """ iterate through all services and fetch the data """ @@ -290,6 +290,20 @@ def get_data_from_json(file_number): # 从JSON返回元数据 if len(t): json_data[translate_value] = special_characters_replacement(t) + if oCC: + cc_vars = conf.cc_convert_vars().split(",") + for cc in cc_vars: + if cc == "actor": + json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']] + json_data['actor'] = oCC.convert(json_data['actor']) + elif cc == "tag": + json_data[cc] = [oCC.convert(t) for t in json_data[cc]] + else: + try: + json_data[cc] = oCC.convert(json_data[cc]) + except: + pass + naming_rule="" for i in conf.naming_rule().split("+"): if i not in json_data: @@ -314,4 +328,6 @@ def special_characters_replacement(text) -> str: replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK - replace('&', '&')) + replace('…','…'). + replace('&', '&') + ) diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py index 790b910..02b5d5c 100755 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -6,17 +6,16 @@ import re from ADC_function import * from WebCrawler.storyline import getStoryline + +G_SITE = 'https://www.caribbeancom.com' + + def main(number: str) -> json: try: - # 因演员图片功能还未使用,为提速暂时注释,改为用get_html() - #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html', - # return_type='browser') - #if not r.ok: - # raise ValueError("page not found") - #htmlcode = str(browser.page) - htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content') - htmlcode = htmlbyte.decode('euc-jp') - if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode: + url = f'{G_SITE}/moviepages/{number}/index.html' + result, session = get_html_session(url, return_type='session') + htmlcode = result.content.decode('euc-jp') + if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode: raise ValueError("page not found") lx = html.fromstring(htmlcode) @@ -32,13 +31,13 @@ def main(number: str) -> json: 'actor': get_actor(lx), 'release': get_release(lx), 'number': number, - 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg', + 'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg', 'tag': get_tag(lx), 'extrafanart': get_extrafanart(lx), 'label': get_series(lx), 'imagecut': 1, -# 'actor_photo': get_actor_photo(browser), - 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html', +# 'actor_photo': get_actor_photo(lx, session), + 'website': f'{G_SITE}/moviepages/{number}/index.html', 'source': 'carib.py', 'series': get_series(lx), } @@ -101,24 +100,25 @@ def get_series(lx: html.HtmlElement) -> str: return '' def get_runtime(lx: html.HtmlElement) -> str: - return str(lx.xpath( "//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip() + return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip() -def get_actor_photo(browser): - htmla = browser.page.select('#moviepages > div > div:nth-child(1) > 
div.movie-info.section > ul > li:nth-child(1) > span.spec-content > a') +def get_actor_photo(lx, session): + htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") + names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()") t = {} - for a in htmla: - if a.text.strip() == '他': + for name, a in zip(names, htmla): + if name.strip() == '他': continue - p = {a.text.strip(): a['href']} + p = {name.strip(): a.attrib['href']} t.update(p) o = {} for k, v in t.items(): if '/search_act/' not in v: continue - r = browser.open_relative(v) + r = session.get(urljoin(G_SITE, v)) if not r.ok: continue - html = browser.page.prettify() + html = r.text pos = html.find('.full-bg') if pos<0: continue @@ -126,7 +126,7 @@ def get_actor_photo(browser): cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) if not cssBGjpgs or not len(cssBGjpgs[0]): continue - p = {k: urljoin(browser.url, cssBGjpgs[0])} + p = {k: urljoin(r.url, cssBGjpgs[0])} o.update(p) return o diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 7866052..d61db8d 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -5,6 +5,7 @@ from lxml import etree#need install import json from ADC_function import * from WebCrawler.storyline import getStoryline +import inspect def getActorPhoto(html): actors = html.xpath('//div[@class="star-name"]/a') @@ -60,6 +61,8 @@ def getCID(html): result = re.sub('/.*?.jpg','',string) return result def getOutline(number, title): #获取剧情介绍 多进程并发查询 + if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): + return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 return getStoryline(number,title) def getSeriseJa(html): x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') @@ -115,8 +118,15 @@ def main_uncensored(number): def main(number): try: try: + url = "https://www." 
+ secrets.choice([ + 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', + 'cdnbus.fun', + 'dmmbus.fun', 'dmmsee.fun', + 'fanbus.us', + 'seedmm.fun', + ]) + "/" try: - htmlcode = get_html('https://www.fanbus.us/' + number) + htmlcode = get_html(url + number) except: htmlcode = get_html('https://www.javbus.com/' + number) if "<title>404 Page Not Found" in htmlcode: diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index e4e803c..9adb7f9 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -4,7 +4,6 @@ import re from lxml import etree import json from ADC_function import * -from mechanicalsoup.stateful_browser import StatefulBrowser from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) @@ -30,8 +29,8 @@ def getActor(html): idx = idx + 1 return r -def getaphoto(url, browser): - html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url) +def getaphoto(url, session): + html_page = session.get(url).text if session is not None else get_html(url) img_prether = re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_url = img_prether.findall(html_page) if img_url: @@ -39,7 +38,7 @@ def getaphoto(url, browser): else: return '' -def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img +def getActorPhoto(html, javdb_site, session): actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]') if not actorall: return {} @@ -47,7 +46,7 @@ def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img actor_photo = {} for i in actorall: if i.text in a: - actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser) + actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session) return actor_photo def getStudio(a, html): @@ -178,15 +177,6 @@ def getDirector(html): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 - try: - htmlcode = get_html('https://cn.airav.wiki/video/' + number) - from WebCrawler.airav import getOutline as airav_getOutline - result = airav_getOutline(htmlcode) - return result - except: - pass - return '' def getOutline(number, title): #获取剧情介绍 多进程并发查询 return getStoryline(number,title) def getSeries(html): @@ -224,15 +214,22 @@ def main(number): javdb_site = secrets.choice(javdb_sites) if debug: print(f'[!]javdb:select site {javdb_site}') - browser = None + session = None + javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' try: - javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' - res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') - if not res.ok: + if debug: + raise # try get_html_by_scraper() branch + res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session') + if not res: raise query_result = res.text except: - query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies) + res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper') + if not res: + raise ValueError('page not found') + query_result = 
res.text + if session is None: + raise ValueError('page not found') html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # javdb sometime returns multiple results, # and the first elememt maybe not the one we are looking for @@ -251,13 +248,12 @@ def main(number): raise ValueError("number not found") correct_url = urls[0] try: - if isinstance(browser, StatefulBrowser): # get faster benefit from http keep-alive - detail_page = browser.open_relative(correct_url).text - else: - javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url - detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) + # get faster benefit from http keep-alive + javdb_detail_url = urljoin(res.url, correct_url) + detail_page = session.get(javdb_detail_url).text except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) + session = None # etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用 lx = etree.fromstring(detail_page, etree.HTMLParser()) @@ -303,8 +299,8 @@ def main(number): 'tag': getTag(lx), 'label': getLabel(lx), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), -# 'actor_photo': getActorPhoto(lx, javdb_site, browser), - 'website': 'https://javdb.com' + correct_url, +# 'actor_photo': getActorPhoto(lx, javdb_site, session), + 'website': urljoin('https://javdb.com', correct_url), 'source': 'javdb.py', 'series': getSeries(lx), @@ -318,7 +314,7 @@ def main(number): except Exception as e: - if config.getInstance().debug(): + if debug: print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -333,12 +329,12 @@ if __name__ == "__main__": # print(main('BANK-022')) # print(main('070116-197')) # print(main('093021_539')) # 没有剧照 片商pacopacomama - # print(main('FC2-2278260')) + print(main('FC2-2278260')) # print(main('FC2-735670')) # print(main('FC2-1174949')) # not found print(main('MVSD-439')) # print(main('EHM0001')) # not found - # print(main('FC2-2314275')) + print(main('FC2-2314275')) # print(main('EBOD-646')) # print(main('LOVE-262')) print(main('ABP-890')) diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py index a9fb8a2..9444473 100644 --- a/WebCrawler/storyline.py +++ b/WebCrawler/storyline.py @@ -4,13 +4,14 @@ import re import json import builtins from ADC_function import * +from lxml.html import fromstring from multiprocessing import Pool from multiprocessing.dummy import Pool as ThreadPool from difflib import SequenceMatcher from unicodedata import category from number_parser import is_uncensored -G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"} +G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"} G_mode_txt = ('顺序执行','线程池','进程池') @@ -27,6 +28,8 @@ class noThread(object): def getStoryline(number, title, sites: list=None): start_time = time.time() conf = config.getInstance() + if not conf.is_storyline(): + return '' debug = conf.debug() or conf.storyline_show() == 2 storyine_sites = conf.storyline_site().split(',') if sites is None else sites if is_uncensored(number): @@ -49,82 +52,87 @@ def getStoryline(number, title, sites: list=None): run_mode = conf.storyline_mode() assert run_mode in (0,1,2) with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool: - result = pool.map(getStoryline_mp, mp_args) + results = pool.map(getStoryline_mp, mp_args) if not debug and conf.storyline_show() == 0: - 
for value in result: + for value in results: if isinstance(value, str) and len(value): return value return '' # 以下debug结果输出会写入日志,进程池中的则不会,只在标准输出中显示 - cnt = len(apply_sites) - s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' + s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}' first = True sel = '' - for i in range(cnt): - sl = len(result[i])if isinstance(result[i], str) else 0 + for site, desc in zip(apply_sites, results): + sl = len(desc) if isinstance(desc, str) else 0 if sl and first: - s += f',[选中{apply_sites[i]}字数:{sl}]' + s += f',[选中{site}字数:{sl}]' first = False - sel = result[i] + sel = desc elif sl: - s += f',{apply_sites[i]}字数:{sl}' + s += f',{site}字数:{sl}' else: - s += f',{apply_sites[i]}:空' + s += f',{site}:空' print(s) return sel def getStoryline_mp(args): - return _getStoryline_mp(*args) - - -# 注:新进程的print()不会写入日志中,将来调试修复失效数据源需直接查看标准输出,issue信息需截图屏幕 -def _getStoryline_mp(site, number, title, debug): - start_time = time.time() - storyline = None - if not isinstance(site, str): + def _inner(site, number, title, debug): + start_time = time.time() + storyline = None + if not isinstance(site, str): + return storyline + elif site == "airavwiki": + storyline = getStoryline_airavwiki(number, debug) + elif site == "airav": + storyline = getStoryline_airav(number, debug) + elif site == "avno1": + storyline = getStoryline_avno1(number, debug) + elif site == "xcity": + storyline = getStoryline_xcity(number, debug) + elif site == "amazon": + storyline = getStoryline_amazon(title, number, debug) + elif site == "58avgo": + storyline = getStoryline_58avgo(number, debug) + if not debug: + return storyline + # 进程池模式的子进程getStoryline_*()的print()不会写入日志中,线程池和顺序执行不受影响 + print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( + site, + time.time() - start_time, + time.strftime("%H:%M:%S"), + storyline if isinstance(storyline, str) and len(storyline) else '[空]') + ) return storyline - elif site == "airav": - storyline = getStoryline_airav(number, debug) - elif site == "avno1": - storyline = getStoryline_avno1(number, debug) - elif site == "xcity": - storyline = getStoryline_xcity(number, debug) - elif site == "amazon": - storyline = getStoryline_amazon(title, number, debug) - elif site == "58avgo": - storyline = getStoryline_58avgo(number, debug) - if not debug: - return storyline - print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( - site, - time.time() - start_time, - time.strftime("%H:%M:%S"), - storyline if isinstance(storyline, str) and len(storyline) else '[空]') - ) - return storyline + return _inner(*args) def getStoryline_airav(number, debug): try: - number_up = number site = secrets.choice(('airav.cc','airav4.club')) url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' - res, browser = get_html_by_browser(url, return_type='browser') - if not res.ok: - raise ValueError(f"get_html_by_browser('{url}') failed") - avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div') - if number_up not in avs.select_one('a > h3').text.upper(): + res, session = get_html_session(url, return_type='session') + if not res: + raise ValueError(f"get_html_by_session('{url}') failed") + lx = fromstring(res.text) + urls = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/@href') + txts = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/h3[@class="one_name ga_name"]/text()') + detail_url 
= None + for txt, url in zip(txts, urls): + if re.search(number, txt, re.I): + detail_url = urljoin(res.url, url) + break + if detail_url is None: raise ValueError("number not found") - detail_url = avs.select_one('a')['href'] - res = browser.open_relative(detail_url) + res = session.get(detail_url) if not res.ok: - raise ValueError(f"browser.open_relative('{detail_url}') failed") - t = browser.page.select_one('head > title').text - airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper() - if number.upper() != airav_number: + raise ValueError(f"session.get('{detail_url}') failed") + lx = fromstring(res.text) + t = str(lx.xpath('/html/head/title/text()')[0]).strip() + airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]) + if not re.search(number, airav_number, re.I): raise ValueError(f"page number ->[{airav_number}] not match") - desc = browser.page.select_one('li.introduction > span').text.strip() + desc = str(lx.xpath('//span[@id="ContentPlaceHolder1_Label2"]/text()')[0]).strip() return desc except Exception as e: if debug: @@ -133,6 +141,43 @@ def getStoryline_airav(number, debug): return None +def getStoryline_airavwiki(number, debug): + try: + kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number + url = f'https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search={kwd}' + result, session = get_html_session(url, return_type='session') + if not result: + raise ValueError(f"get_html_session('{url}','{number}') failed") + j = json.loads(result.content) + if int(j.get('count')) == 0: + raise ValueError("number not found") + link = None + for r in j["result"]: + n = r['barcode'] + if re.search(number, n, re.I): + link = urljoin(result.url, f'/api/video/barcode/{n}?lng=zh-TW') + break + if link is None: + raise ValueError("number not found") + result = session.get(link) + if not result.ok or not re.search(number, result.url, re.I): + raise ValueError("detail page not found") + j = json.loads(result.content) + if int(j.get('count')) != 1: + raise ValueError("number not found") + detail_number = j["result"]['barcode'] + if not re.search(number, detail_number, re.I): + raise ValueError("detail page number not match, got ->[{detail_number}]") + desc = j["result"]['description'] + return desc + + except Exception as e: + if debug: + print(f"[-]MP getStoryline_airavwiki Error: {e}, number [{number}].") + pass + return '' + + def getStoryline_58avgo(number, debug): try: url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([ @@ -143,27 +188,27 @@ def getStoryline_58avgo(number, debug): result, browser = get_html_by_form(url, fields = {'ctl00$TextBox_SearchKeyWord' : kwd}, return_type = 'browser') - if not result.ok: + if not result: raise ValueError(f"get_html_by_form('{url}','{number}') failed") if f'searchresults.aspx?Search={kwd}' not in browser.url: raise ValueError("number not found") s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click') link = None - for i in range(len(s)): - title = s[i].h3.text.strip() + for a in s: + title = a.h3.text.strip() if re.search(number, title, re.I): - link = s[i] + link = a break if link is None: raise ValueError("number not found") result = browser.follow_link(link) if not result.ok or 'playon.aspx' not in browser.url: raise ValueError("detail page not found") - title = browser.page.select('head > title')[0].text.strip() + title = browser.page.select_one('head > title').text.strip() detail_number = str(re.findall('\[(.*?)]', title)[0]) if not re.search(number, detail_number, 
re.I): raise ValueError("detail page number not match, got ->[{detail_number}]") - return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip() + return browser.page.select_one('#ContentPlaceHolder1_Label2').text.strip() except Exception as e: if debug: print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].") @@ -172,6 +217,29 @@ def getStoryline_58avgo(number, debug): def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 + try: + site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc', + 'hotav.biz','iqq2.xyz','javhq.tv', + 'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',]) + url = f'http://{site}/cn/search.php?kw_type=key&kw={number}' + lx = fromstring(get_html_by_scraper(url)) + descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description') + titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()') + if not descs or not len(descs): + raise ValueError(f"number not found") + for title, desc in zip(titles, descs): + page_number = title[title.rfind(' '):].strip() + if re.search(number, page_number, re.I): + return desc.strip() + raise ValueError(f"page number ->[{page_number}] not match") + except Exception as e: + if debug: + print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 try: url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), @@ -181,14 +249,14 @@ def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 form_select='div.wrapper > div.header > div.search > form', fields = {'kw' : number}, return_type = 'browser') - if not result.ok: + if not result: raise ValueError(f"get_html_by_form('{url}','{number}') failed") s = browser.page.select('div.type_movie > div > ul > li > div') - for i in range(len(s)): - title = s[i].a.h3.text.strip() + for div in s: + title = div.a.h3.text.strip() page_number = title[title.rfind(' '):].strip() if re.search(number, page_number, re.I): - return s[i]['data-description'].strip() + return div['data-description'].strip() raise ValueError(f"page number ->[{page_number}] not match") except Exception as e: if debug: @@ -221,41 +289,45 @@ def getStoryline_amazon(q_title, number, debug): if not isinstance(q_title, str) or not len(q_title): return None try: - amazon_cookie, _ = load_cookies('amazon.json') - cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None + cookie, cookies_filepath = load_cookies('amazon.json') url = "https://www.amazon.co.jp/s?k=" + q_title - res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser') - if not res.ok: - raise ValueError("get_html_by_browser() failed") - lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') - if isinstance(lks, list) and len(lks): - browser.follow_link(lks[0]) + res, session = get_html_session(url, cookies=cookie, return_type='session') + if not res: + raise ValueError("get_html_session() failed") + lx = fromstring(res.text) + lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href') + if len(lks) and lks[0].startswith('/'): + res = session.get(urljoin(res.url, lks[0])) cookie = None - html = etree.fromstring(str(browser.page), etree.HTMLParser()) - titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") - urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") + lx = fromstring(res.text) + titles = 
lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()") + urls = lx.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href") if not len(urls) or len(urls) != len(titles): raise ValueError("titles not found") idx = amazon_select_one(titles, q_title, number, debug) if not isinstance(idx, int) or idx < 0: raise ValueError("title and number not found") - furl = urls[idx] - r = browser.open_relative(furl) - if not r.ok: + furl = urljoin(res.url, urls[idx]) + res = session.get(furl) + if not res.ok: raise ValueError("browser.open_relative()) failed.") - lks = browser.links(r'/black-curtain/save-eligibility/black-curtain') - if isinstance(lks, list) and len(lks): - browser.follow_link(lks[0]) + lx = fromstring(res.text) + lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href') + if len(lks) and lks[0].startswith('/'): + res = session.get(urljoin(res.url, lks[0])) cookie = None - - ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip() - ama_t = re.sub(r'審査番号:\d+', '', ama_t) + lx = fromstring(res.text) + div = lx.xpath('//*[@id="productDescription"]')[0] + ama_t = ' '.join([e.text.strip() for e in div if not re.search('Comment|h3', str(e.tag), re.I) and isinstance(e.text, str)]) + ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip() if cookie is None: - # 自动创建的cookies文件放在搜索路径表的末端,最低优先级。有amazon.co.jp帐号的用户可以从浏览器导出cookie放在靠前搜索路径 + # 删除无效cookies,无论是用户创建还是自动创建,以避免持续故障 + cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True) + # 自动创建的cookies文件放在搜索路径表的末端,最低优先级。有amazon.co.jp帐号的用户可以从浏览器导出cookie放在靠前搜索路径 ama_save = Path.home() / ".local/share/avdc/amazon.json" ama_save.parent.mkdir(parents=True, exist_ok=True) - ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') + ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8') return ama_t @@ -270,32 +342,31 @@ def amazon_select_one(a_titles, q_title, number, debug): sel = -1 ratio = 0 que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A)) - for loc in range(len(a_titles)): - t = a_titles[loc] - if re.search(number, t, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 - return loc - if not re.search('DVD|Blu-ray', t, re.I): + for tloc, title in enumerate(a_titles): + if re.search(number, title, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过 + return tloc + if not re.search('DVD|Blu-ray', title, re.I): continue - ama_t = str(re.sub('DVD|Blu-ray', "", t, re.I)) + ama_t = str(re.sub('DVD|Blu-ray', "", title, re.I)) ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A)) findlen = 0 lastpos = -1 - cnt = len(ama_t) - for c in reversed(ama_t): - cnt -= 1 - pos = que_t.rfind(c) + for cloc, char in reversed(tuple(enumerate(ama_t))): + pos = que_t.rfind(char) if lastpos >= 0: - pos_near = que_t[:lastpos].rfind(c) + pos_near = que_t[:lastpos].rfind(char) if pos_near < 0: findlen = 0 lastpos = -1 - ama_t = ama_t[:cnt+1] + ama_t = ama_t[:cloc+1] else: pos = pos_near if pos < 0: - if category(c) == 'Nd': + if category(char) == 'Nd': return -1 - ama_t = ama_t[:cnt] + if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U): + return -1 + ama_t = ama_t[:cloc] findlen = 0 lastpos = -1 continue @@ -311,7 +382,7 @@ def amazon_select_one(a_titles, q_title, number, debug): return -1 r = SequenceMatcher(None, ama_t, que_t).ratio() 
if r > ratio: - sel = loc + sel = tloc ratio = r save_t_ = ama_t if ratio > 0.999: diff --git a/config.ini b/config.ini index 7db538c..e887999 100755 --- a/config.ini +++ b/config.ini @@ -1,4 +1,4 @@ -# 详细教程请看 +# 详细教程请看 # - https://github.com/yoshiko2/AV_Data_Capture/wiki#%E9%85%8D%E7%BD%AEconfigini [common] main_mode=1 @@ -83,24 +83,29 @@ water=2 ; 剧照 [extrafanart] -switch=0 +switch=1 parallel_download=5 extrafanart_folder=extrafanart ; 剧情简介 [storyline] +switch=1 ; website为javbus javdb avsox xcity carib时,site censored_site uncensored_site 为获取剧情简介信息的 ; 可选数据源站点列表。列表内站点同时并发查询,取值优先级由冒号前的序号决定,从小到大,数字小的站点没数据才会采用后面站点获得的。 -; 其中airav avno1 58avgo是中文剧情简介,区别是airav只能查有码,avno1有码无码都能查,58avgo只能查无码或者 -; 流出破解马赛克的影片(此功能没使用)。 +; 其中airavwiki airav avno1 58avgo是中文剧情简介,区别是airav只能查有码,avno1 airavwiki 有码无码都能查, +; 58avgo只能查无码或者流出破解马赛克的影片(此功能没使用)。 ; xcity和amazon是日语的,由于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询, ; 设置成不查询可大幅提高刮削速度。 ; site= -site=3:avno1 -censored_site=1:airav,4:xcity,5:amazon -uncensored_site=2:58avgo +site=1:avno1,4:airavwiki +censored_site=2:airav,5:xcity,6:amazon +uncensored_site=3:58avgo ; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) run_mode=1 ; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 show_result=0 +; 繁简转换 繁简转换模式mode=0:不转换 1:繁转简 2:简转繁 +[cc_convert] +mode=1 +vars=actor,director,label,outline,series,studio,tag,title diff --git a/config.py b/config.py index 138c9e1..1047961 100644 --- a/config.py +++ b/config.py @@ -246,23 +246,29 @@ class Config: def debug(self) -> bool: return self.getboolean_override("debug_mode", "switch") + def is_storyline(self) -> bool: + try: + return self.conf.getboolean("storyline", "switch") + except: + return True + def storyline_site(self) -> str: try: return self.conf.get("storyline", "site") except: - return "avno1" + return "1:avno1,4:airavwiki" def storyline_censored_site(self) -> str: try: return self.conf.get("storyline", "censored_site") except: - return "airav,xcity,amazon" + return "2:airav,5:xcity,6:amazon" def storyline_uncensored_site(self) -> str: try: return self.conf.get("storyline", "uncensored_site") except: - return "58avgo" + return "3:58avgo" def storyline_show(self) -> int: try: @@ -278,6 +284,19 @@ class Config: except: return 1 + def cc_convert_mode(self) -> int: + try: + v = self.conf.getint("cc_convert", "mode") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 1 + + def cc_convert_vars(self) -> str: + try: + return self.conf.get("cc_convert", "vars") + except: + return "actor,director,label,outline,series,studio,tag,title" + @staticmethod def _exit(sec: str) -> None: print("[-] Read config error! 
Please check the {} section in config.ini", sec) @@ -374,11 +393,18 @@ class Config: sec14 = "storyline" conf.add_section(sec14) - conf.set(sec14, "site", "avno1") - conf.set(sec14, "censored_site", "airav,xcity,amazon") - conf.set(sec14, "uncensored_site", "58avgo") + conf.set(sec14, "switch", 1) + conf.set(sec14, "site", "1:avno1,4:airavwiki") + conf.set(sec14, "censored_site", "2:airav,5:xcity,6:amazon") + conf.set(sec14, "uncensored_site", "3:58avgo") conf.set(sec14, "show_result", 0) conf.set(sec14, "run_mode", 1) + conf.set(sec14, "cc_convert", 1) + + sec15 = "cc_convert" + conf.add_section(sec15) + conf.set(sec15, "mode", 1) + conf.set(sec15, "vars", "actor,director,label,outline,series,studio,tag,title") return conf diff --git a/core.py b/core.py index 0e662d1..069c327 100755 --- a/core.py +++ b/core.py @@ -217,14 +217,12 @@ def extrafanart_download_one_by_one(data, path, filepath): print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s') def download_one_file(args): - return _download_one_file(*args) - -def _download_one_file(url: str, save_path: Path): - filebytes = get_html(url, return_type='content') - if isinstance(filebytes, bytes) and len(filebytes): - if len(filebytes) == save_path.open('wb').write(filebytes): - return str(save_path) - return None + def _inner(url: str, save_path: Path): + filebytes = get_html(url, return_type='content') + if isinstance(filebytes, bytes) and len(filebytes): + if len(filebytes) == save_path.open('wb').write(filebytes): + return str(save_path) + return _inner(*args) def extrafanart_download_threadpool(url_list, save_dir, number): tm_start = time.perf_counter() @@ -232,11 +230,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number): extrafanart_dir = Path(save_dir) / conf.get_extrafanart() download_only_missing_images = conf.download_only_missing_images() mp_args = [] - for i in range(len(url_list)): - jpg_fullpath = extrafanart_dir / f'extrafanart-{i+1}.jpg' + for i, url in enumerate(url_list, start=1): + jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg' if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath): continue - mp_args.append((url_list[i], jpg_fullpath)) + mp_args.append((url, jpg_fullpath)) if not len(mp_args): return extrafanart_dir.mkdir(parents=True, exist_ok=True) @@ -246,11 +244,11 @@ def extrafanart_download_threadpool(url_list, save_dir, number): with ThreadPoolExecutor(parallel) as pool: result = list(pool.map(download_one_file, mp_args)) failed = 0 - for i in range(len(result)): - if not result[i]: - print(f'[-]Extrafanart {i+1} for [{number}] download failed!') + for i, r in enumerate(result, start=1): + if not r: failed += 1 - if not all(result): # 非致命错误,电影不移入失败文件夹,将来可以用模式3补齐 + print(f'[-]Extrafanart {i} for [{number}] download failed!') + if failed: # 非致命错误,电影不移入失败文件夹,将来可以用模式3补齐 print(f"[-]Failed downloaded {failed}/{len(result)} extrafanart images for [{number}] to '{extrafanart_dir}', you may retry run mode 3 later.") else: print(f"[+]Successfully downloaded {len(result)} extrafanart to '{extrafanart_dir}'") @@ -574,7 +572,7 @@ def debug_print(data: json): pass -def core_main(file_path, number_th): +def core_main(file_path, number_th, oCC): conf = config.getInstance() # =======================================================================初始化所需变量 multi_part = 0 @@ -589,7 +587,7 @@ def core_main(file_path, number_th): # 下面被注释的变量不需要 #rootpath= os.getcwd number = number_th - json_data = get_data_from_json(number) # 定义番号 + json_data = 
get_data_from_json(number, oCC) # 定义番号 # Return if blank dict returned (data not found) if not json_data: diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 index 77f169a..f45e98d 100644 --- a/py_to_exe.ps1 +++ b/py_to_exe.ps1 @@ -2,6 +2,7 @@ # Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1) +$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1) mkdir build mkdir __pycache__ @@ -10,6 +11,7 @@ pyinstaller --onefile AV_Data_Capture.py ` --hidden-import ADC_function.py ` --hidden-import core.py ` --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` + --add-data "$OPENCC_PATH;opencc" ` --add-data "Img;Img" ` --add-data "config.ini;." ` diff --git a/requirements.txt b/requirements.txt index dced944..91bfa8c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ pysocks==1.7.1 urllib3==1.24.3 certifi==2020.12.5 MechanicalSoup==1.1.0 +opencc==1.1.1 diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh index 9717ef4..98f25f6 100755 --- a/wrapper/FreeBSD.sh +++ b/wrapper/FreeBSD.sh @@ -2,6 +2,7 @@ pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscra pip install pyquery pyinstaller pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "$(python3.8 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." \ diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh index 63e3b1c..043c8bf 100755 --- a/wrapper/Linux.sh +++ b/wrapper/Linux.sh @@ -14,6 +14,7 @@ pip3 install -r requirements.txt pip3 install cloudscraper==1.2.52 pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \ + --add-data "$(python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1):opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." \
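
Reviewer note (not part of the patch) — usage sketch for the new keep-alive helper. The snippet below follows the pattern this diff introduces in WebCrawler/carib.py, where get_html_session(url, return_type='session') returns both the first response and a reusable requests.Session; the URL, the euc-jp decode and the commented follow-up request are illustrative assumptions, not code from the patch.

# sketch: reuse one HTTP session for the detail page and any related requests
from urllib.parse import urljoin
from ADC_function import get_html_session

def fetch_movie_page(number: str):
    url = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
    fetched = get_html_session(url, return_type='session')
    if not fetched:            # helper returns None on proxy/network failure
        return None
    result, session = fetched  # response + reusable keep-alive session
    htmlcode = result.content.decode('euc-jp')  # carib pages are EUC-JP encoded
    # follow-up requests (actor pages, images) can reuse the same connection:
    # extra = session.get(urljoin(result.url, '/search_act/...'))
    return htmlcode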
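
Reviewer note (not part of the patch) — sketch of the Traditional/Simplified Chinese conversion flow added by this diff (cc_convert mode 0 = off, 1 = t2s, 2 = s2t). The constructor fallback mirrors AV_Data_Capture.main(), and the per-field loop mirrors WebCrawler/__init__.py; the sample metadata dict and the hard-coded mode/vars values are made up for illustration.

from opencc import OpenCC

ccm = 1  # pretend conf.cc_convert_mode() returned 1 (traditional -> simplified)
try:
    oCC = None if ccm == 0 else OpenCC('t2s.json' if ccm == 1 else 's2t.json')
except Exception:
    # opencc-python-reimplemented expects the config name without ".json"
    oCC = None if ccm == 0 else OpenCC('t2s' if ccm == 1 else 's2t')

json_data = {'title': '標題', 'tag': ['中文字幕', '單體作品'],
             'actor': '演員', 'actor_list': ['演員']}
if oCC:
    for cc in "actor,tag,title".split(","):  # would come from conf.cc_convert_vars()
        if cc == "actor":
            json_data['actor_list'] = [oCC.convert(a) for a in json_data['actor_list']]
            json_data['actor'] = oCC.convert(json_data['actor'])
        elif cc == "tag":
            json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
        else:
            json_data[cc] = oCC.convert(json_data[cc])

print(json_data['title'])  # -> 标题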