Merge pull request #631 from lededev/parallel-mapdown

Parallel mapping table download
Yoshiko2 committed 2021-11-06 23:18:55 +08:00 (via GitHub)
3 changed files with 56 additions and 27 deletions


@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor
 
 def getXpathSingle(htmlcode, xpath):
@@ -490,6 +491,37 @@ def download_file_with_filename(url, filename, path):
     raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
     return
+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+
+'''Usage example: download two different files on 2 threads and save them to different
+paths. The target directories do not need to exist yet, but write permission on the
+target directories and files is required.
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+])
+'''
+# dn_list can be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), )
+# parallel: thread-pool size for the parallel downloads; 0 lets the function decide
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1,200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
 def delete_all_elements_in_list(string,lists):
     new_lists = []
     for i in lists:
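For orientation, a minimal usage sketch of the helper added above; the URLs and destination paths are hypothetical and assumed reachable and writable. Per the implementation, parallel_download_files returns one result per accepted task, in input order, holding the saved path on success and None on failure.

from pathlib import Path
from ADC_function import parallel_download_files

# Hypothetical URLs and destinations, for illustration only.
tasks = [
    ('https://example.com/img/p1.jpg', Path('/tmp/avdc-demo/img/p1.jpg')),
    ('https://example.com/cover/n1.xml', '/tmp/avdc-demo/cover/n1.xml'),
]

# Two worker threads; passing 0 (the default) makes the helper pick min(5, len(tasks)).
results = parallel_download_files(tasks, parallel=2)

# Results stay aligned with the inputs, so they can be zipped back together.
for (url, dest), saved in zip(tasks, results):
    print(f'[+] {url} -> {saved}' if saved else f'[-] download failed: {url}')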


@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)
 
-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
 
     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
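The [i/len(res)] progress lines above, and the per-index failure counting in core.py below, both depend on ThreadPoolExecutor.map returning results in input order even though the downloads finish in arbitrary order. A small self-contained sketch of that property, using a toy function rather than the project code:

import random
import time
from concurrent.futures import ThreadPoolExecutor

def slow_square(n):
    # Random sleep so completion order differs from submission order.
    time.sleep(random.random() / 10)
    return n * n

with ThreadPoolExecutor(4) as pool:
    results = list(pool.map(slow_square, range(8)))

print(results)  # always [0, 1, 4, 9, 16, 25, 36, 49], aligned with the inputs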

core.py

@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
 
-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
-    extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warrning: Parallel download thread too large may cause website ban IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
 
+# Whether the cover downloaded successfully; if not, move the file to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
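Note that the extrafanart_dir.mkdir(...) call removed above is covered by the new helper: parallel_download_files creates each target's parent directory before queueing the download. A minimal sketch of that pattern, using only a temporary directory and no network access:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    target = Path(tmp) / 'extrafanart' / 'extrafanart-1.jpg'
    # The same call parallel_download_files makes for every accepted (url, path) pair:
    target.parent.mkdir(parents=True, exist_ok=True)
    print(target.parent.is_dir())  # True: the nested directory is created on demand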