parallel map download
ADC_function.py
@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor
 
 
 def getXpathSingle(htmlcode, xpath):
@@ -827,4 +828,35 @@ def download_file_with_filename(url, filename, path):
            return
    print('[-]Connect Failed! Please check your Proxy or Network!')
    raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
    return
+
+
+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+
+'''Usage example: download two different files concurrently on 2 threads and save
+them to different paths. The target directories need not exist in advance, but
+write permission to the target directories and files is required.
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+])
+'''
+# dn_list may be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), ...)
+# parallel: thread count of the download pool; 0 lets the function decide
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1, 200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
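
The two new helpers boil down to the standard pattern of mapping a one-argument worker over (url, path) tuples with a thread pool; the worker takes a single tuple because Executor.map passes exactly one item per call. A minimal, self-contained sketch of that pattern, where requests stands in for the project's get_html and the URLs and paths are placeholders, not part of the commit:

    # Standalone sketch of the thread-pool download pattern used above.
    from concurrent.futures import ThreadPoolExecutor
    from pathlib import Path
    import requests

    def fetch_one(args):
        url, save_path = args                  # single tuple arg, as pool.map delivers it
        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        data = requests.get(url, timeout=30).content
        if data:
            save_path.write_bytes(data)
            return str(save_path)              # truthy result marks success

    jobs = [('https://httpbin.org/image/jpeg', 'tmp/p1.jpg'),
            ('https://httpbin.org/xml', 'tmp/n1.xml')]
    with ThreadPoolExecutor(min(5, len(jobs))) as pool:
        results = list(pool.map(fetch_one, jobs))  # result order matches jobs
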
AV_Data_Capture.py
@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
 
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)
 
-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / logdir / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / logdir))
-        print("[+] [1/2] Mapping Table Downloaded")
-
-    if not os.path.exists(str(Path.home() / logdir / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / logdir))
-        print("[+] [2/2] Mapping Table Downloaded")
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
 
     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
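
The call site above relies on the helper's calling convention: destinations may be str or Path, parent directories are created on demand, and the returned list preserves input order, which is what makes the [i/len] progress lines line up with the requested files. A hypothetical call shape, with placeholder URLs and paths:

    from ADC_function import parallel_download_files

    res = parallel_download_files([
        ('https://example.com/mapping_actor.xml', '/tmp/avdc/mapping_actor.xml'),
        ('https://example.com/mapping_info.xml', '/tmp/avdc/mapping_info.xml'),
    ])
    # res[i] is the saved path string on success, or None when that download failed
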
core.py
@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
 
 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
 
-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
     extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warning: too many parallel download threads may get this IP banned by the website!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
 
+
 # Check whether the cover downloaded successfully; otherwise move it to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
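
One nuance of the download_one_file helper shared by both files: save_path.open('wb').write(filebytes) never closes the file explicitly and relies on CPython's reference counting to flush and close the handle promptly. A more explicit equivalent, offered here only as a sketch and not part of the commit, is:

    from pathlib import Path

    def save_bytes(save_path: Path, filebytes: bytes):
        # Path.write_bytes opens, writes, and closes the file in one call,
        # returning the number of bytes written
        if save_path.write_bytes(filebytes) == len(filebytes):
            return str(save_path)
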