Merge pull request #631 from lededev/parallel-mapdown

parallel map download
Yoshiko2 committed 2021-11-06 23:18:55 +08:00 (committed by GitHub)
3 changed files with 56 additions and 27 deletions

ADC_function.py

@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor


 def getXpathSingle(htmlcode, xpath):
@@ -490,9 +491,40 @@ def download_file_with_filename(url, filename, path):
             raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
             return


+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+
+'''Usage example: download two different files on 2 threads at the same time
+and save them to different paths. The target directories may not exist yet,
+but write permission on the target directories and files is required.
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+])
+'''
+# dn_list may be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), ...)
+# parallel: thread count of the download pool; 0 lets the function decide on its own
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1, 200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
+

 def delete_all_elements_in_list(string, lists):
     new_lists = []
     for i in lists:
         if i != string:
             new_lists.append(i)
     return new_lists
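
Below the new helper, for orientation: a minimal usage sketch (the URLs and
paths are placeholders, not part of the commit). parallel_download_files
returns one result per accepted entry, in input order: the saved path string
on success, or None when a download fails or writes fewer bytes than expected.

    from ADC_function import parallel_download_files

    # Placeholder URLs/paths; parent directories are created automatically.
    results = parallel_download_files([
        ('https://example.com/a.jpg', '/tmp/demo/a.jpg'),
        ('https://example.com/b.xml', '/tmp/demo/b.xml'),
    ], parallel=2)

    for i, saved in enumerate(results, start=1):
        print(f'[{i}] {saved if saved else "download failed"}')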

AV_Data_Capture.py

@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)

-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
-            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
+            "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")

     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
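
One detail the new reporting loop relies on: parallel_download_files drives
ThreadPoolExecutor.map, which yields results in submission order, so index i
matches the i-th (url, path) pair appended to down_map_tab (both entries here
pass the helper's validity filter). A standard-library illustration of that
ordering guarantee, not taken from the commit:

    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(3) as pool:
        # Results arrive in input order even if workers finish out of order.
        print(list(pool.map(lambda n: n * n, [3, 1, 2])))  # prints [9, 1, 4]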

core.py

@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor

 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')

-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-

 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
     extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warning: too many parallel download threads may cause the website to ban your IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')

+# Check whether the cover downloaded successfully; if not, move the file to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
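
A closing note on download_one_file's one-argument signature: Executor.map
passes a single item from the iterable to each call, which is why the helper
(now living only in ADC_function.py) packs (url, save_path) into a tuple and
unpacks it inside. A sketch of the same idiom with a hypothetical worker
(names are illustrative only):

    from concurrent.futures import ThreadPoolExecutor

    def work_one(args):
        # map() delivers one tuple per call; unpack it here.
        url, save_path = args
        return f'fetched {url} -> {save_path}'

    jobs = [('https://example.com/1.jpg', '/tmp/1.jpg'),
            ('https://example.com/2.jpg', '/tmp/2.jpg')]
    with ThreadPoolExecutor(2) as pool:
        for line in pool.map(work_one, jobs):
            print(line)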