diff --git a/ADC_function.py b/ADC_function.py
index 346b49c..fcf64f0 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor
 
 
 def getXpathSingle(htmlcode, xpath):
@@ -490,9 +491,40 @@ def download_file_with_filename(url, filename, path):
         raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
     return
 
+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+'''Usage example: download two different files on 2 threads at the same time, saving them to different paths; the target directories may not exist yet, but write permission on the target directories and files is required
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+    ])
+'''
+# dn_list may be a tuple or a list: ((url1, save_fullpath1),(url2, save_fullpath2),)
+# parallel: number of threads in the download pool; 0 lets the function decide by itself
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1,200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
+
 def delete_all_elements_in_list(string,lists):
     new_lists = []
     for i in lists:
         if i != string:
             new_lists.append(i)
-    return new_lists
\ No newline at end of file
+    return new_lists
diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py
index fa6b7f3..bf28a94 100755
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
 
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)
 
-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
 
     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
diff --git a/core.py b/core.py
index 069c327..ebe47b7 100755
--- a/core.py
+++ b/core.py
@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
 from ADC_function import *
 from WebCrawler import get_data_from_json
 
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
 
-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
 
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
-    extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warning: a large parallel download thread count may cause the website to ban your IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
 
+
 # Check whether the cover downloaded successfully; otherwise move the file to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
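
Note on the moved `download_one_file`: `save_path.open('wb').write(filebytes)` never closes the file object explicitly, so flushing the data to disk relies on CPython's reference counting, and an `OSError` raised inside a worker propagates out of `ThreadPoolExecutor.map()` and aborts collection of the remaining results. Below is a minimal hardened sketch of the same helper; the `with` block and the `except OSError` guard are suggestions, not part of this patch, `get_html` is the existing helper in ADC_function.py, and `Path` is assumed to be imported there already:

```python
from pathlib import Path

def download_one_file(args):
    """Download one (url, save_path) pair; return str(save_path) on success,
    None on failure, matching what parallel_download_files() reports."""
    def _inner(url: str, save_path: Path):
        # get_html(..., return_type='content') returns the body as bytes;
        # the isinstance() check filters out any non-bytes failure value.
        filebytes = get_html(url, return_type='content')
        if not isinstance(filebytes, bytes) or not len(filebytes):
            return None
        try:
            # A 'with' block flushes and closes the handle deterministically
            # instead of relying on refcounting; the byte-count comparison
            # mirrors the success test used in the patch above.
            with save_path.open('wb') as f:
                if f.write(filebytes) == len(filebytes):
                    return str(save_path)
        except OSError:
            # Swallow per-file I/O errors so one bad path does not abort
            # pool.map() for the remaining downloads.
            return None
    return _inner(*args)
```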