Merge pull request #631 from lededev/parallel-mapdown

Parallel mapping table download
Yoshiko2 committed 2021-11-06 23:18:55 +08:00 (via GitHub)
3 changed files with 56 additions and 27 deletions


@@ -15,6 +15,7 @@ import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 from cloudscraper import create_scraper
+from concurrent.futures import ThreadPoolExecutor
 
 def getXpathSingle(htmlcode, xpath):
@@ -490,6 +491,37 @@ def download_file_with_filename(url, filename, path):
     raise ValueError('[-]Connect Failed! Please check your Proxy or Network!')
     return
+def download_one_file(args):
+    def _inner(url: str, save_path: Path):
+        filebytes = get_html(url, return_type='content')
+        if isinstance(filebytes, bytes) and len(filebytes):
+            if len(filebytes) == save_path.open('wb').write(filebytes):
+                return str(save_path)
+    return _inner(*args)
+
+
+'''Usage example: download two different files on 2 threads and save them to different
+paths. The target directories do not need to exist yet, but write permission on the
+target directories and files is required.
+parallel_download_files([
+    ('https://site1/img/p1.jpg', 'C:/temp/img/p1.jpg'),
+    ('https://site2/cover/n1.xml', 'C:/tmp/cover/n1.xml')
+])
+'''
+# dn_list can be a tuple or a list: ((url1, save_fullpath1), (url2, save_fullpath2), )
+# parallel: thread-pool size for the parallel downloads; 0 lets the function decide
+def parallel_download_files(dn_list, parallel: int = 0):
+    mp_args = []
+    for url, fullpath in dn_list:
+        if url and isinstance(url, str) and url.startswith('http') and fullpath and isinstance(fullpath, (str, Path)) and len(str(fullpath)):
+            fullpath = Path(fullpath)
+            fullpath.parent.mkdir(parents=True, exist_ok=True)
+            mp_args.append((url, fullpath))
+    if not len(mp_args):
+        return []
+    if not isinstance(parallel, int) or parallel not in range(1,200):
+        parallel = min(5, len(mp_args))
+    with ThreadPoolExecutor(parallel) as pool:
+        results = list(pool.map(download_one_file, mp_args))
+    return results
 def delete_all_elements_in_list(string,lists):
     new_lists = []
     for i in lists:
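For orientation, a minimal usage sketch of the helper added above; the URLs and destination paths are hypothetical and assumed reachable and writable. Per the implementation, parallel_download_files returns one result per accepted task, in input order, holding the saved path on success and None on failure.

from pathlib import Path
from ADC_function import parallel_download_files

# Hypothetical URLs and destinations, for illustration only.
tasks = [
    ('https://example.com/img/p1.jpg', Path('/tmp/avdc-demo/img/p1.jpg')),
    ('https://example.com/cover/n1.xml', '/tmp/avdc-demo/cover/n1.xml'),
]

# Two worker threads; passing 0 (the default) makes the helper pick min(5, len(tasks)).
results = parallel_download_files(tasks, parallel=2)

# Results stay aligned with the inputs, so they can be zipped back together.
for (url, dest), saved in zip(tasks, results):
    print(f'[+] {url} -> {saved}' if saved else f'[-] download failed: {url}')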


@@ -14,7 +14,7 @@ import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html
+from ADC_function import file_modification_days, get_html, parallel_download_files
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -473,18 +473,24 @@ def main():
     if conf.update_check():
         check_update(version)
 
-    # Download Mapping Table
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml')):
-        ADC_function.download_file_with_filename(
+    # Download Mapping Table, parallel version
+    down_map_tab = []
+    actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml'
+    if not actor_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml",
-            "mapping_actor.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [1/2] Mapping Table Downloaded")
-    if not os.path.exists(str(Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml')):
-        ADC_function.download_file_with_filename(
+            actor_xml))
+    info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml'
+    if not info_xml.exists():
+        down_map_tab.append((
             "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml",
-            "mapping_info.xml", str(Path.home() / '.local' / 'share' / 'avdc'))
-        print("[+] [2/2] Mapping Table Downloaded")
+            info_xml))
+    res = parallel_download_files(down_map_tab)
+    for i, fp in enumerate(res, start=1):
+        if fp and len(fp):
+            print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}")
+        else:
+            print(f"[-] [{i}/{len(res)}] Mapping Table Download failed")
 
     print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
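The [i/len(res)] progress lines above, and the per-index failure counting in core.py below, both depend on ThreadPoolExecutor.map returning results in input order even though the downloads finish in arbitrary order. A small self-contained sketch of that property, using a toy function rather than the project code:

import random
import time
from concurrent.futures import ThreadPoolExecutor

def slow_square(n):
    # Random sleep so completion order differs from submission order.
    time.sleep(random.random() / 10)
    return n * n

with ThreadPoolExecutor(4) as pool:
    results = list(pool.map(slow_square, range(8)))

print(results)  # always [0, 1, 4, 9, 16, 25, 36, 49], aligned with the inputs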

core.py

@@ -9,7 +9,6 @@ from PIL import Image
 from io import BytesIO
 from pathlib import Path
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
 from ADC_function import *
 from WebCrawler import get_data_from_json
@@ -216,33 +215,24 @@ def extrafanart_download_one_by_one(data, path, filepath):
     if conf.debug():
         print(f'[!]Extrafanart download one by one mode runtime {time.perf_counter() - tm_start:.3f}s')
 
-def download_one_file(args):
-    def _inner(url: str, save_path: Path):
-        filebytes = get_html(url, return_type='content')
-        if isinstance(filebytes, bytes) and len(filebytes):
-            if len(filebytes) == save_path.open('wb').write(filebytes):
-                return str(save_path)
-    return _inner(*args)
-
 def extrafanart_download_threadpool(url_list, save_dir, number):
     tm_start = time.perf_counter()
     conf = config.getInstance()
     extrafanart_dir = Path(save_dir) / conf.get_extrafanart()
     download_only_missing_images = conf.download_only_missing_images()
-    mp_args = []
+    dn_list = []
     for i, url in enumerate(url_list, start=1):
         jpg_fullpath = extrafanart_dir / f'extrafanart-{i}.jpg'
         if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
             continue
-        mp_args.append((url, jpg_fullpath))
-    if not len(mp_args):
+        dn_list.append((url, jpg_fullpath))
+    if not len(dn_list):
         return
-    extrafanart_dir.mkdir(parents=True, exist_ok=True)
-    parallel = min(len(mp_args), conf.extrafanart_thread_pool_download())
+    parallel = min(len(dn_list), conf.extrafanart_thread_pool_download())
     if parallel > 100:
         print('[!]Warrning: Parallel download thread too large may cause website ban IP!')
-    with ThreadPoolExecutor(parallel) as pool:
-        result = list(pool.map(download_one_file, mp_args))
+    result = parallel_download_files(dn_list, parallel)
     failed = 0
     for i, r in enumerate(result, start=1):
         if not r:
@@ -255,6 +245,7 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
     if conf.debug():
         print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
 
+# Whether the cover downloaded successfully; if not, move the file to the failed folder
 def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
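Note that the extrafanart_dir.mkdir(...) call removed above is covered by the new helper: parallel_download_files creates each target's parent directory before queueing the download. A minimal sketch of that pattern, using only a temporary directory and no network access:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    target = Path(tmp) / 'extrafanart' / 'extrafanart-1.jpg'
    # The same call parallel_download_files makes for every accepted (url, path) pair:
    target.parent.mkdir(parents=True, exist_ok=True)
    print(target.parent.is_dir())  # True: the nested directory is created on demand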