From daedd3071c58407dafb52b855322fa7c6b85a0dd Mon Sep 17 00:00:00 2001
From: naughtyGitCat
Date: Fri, 16 Sep 2022 17:18:17 +0800
Subject: [PATCH] PEP8 space line pretty

PEP8 var name pretty
add __main__ comment
global and local var isolation
---
 ADC_function.py | 229 +++++++++++++++++++++++++-----------------
 core.py         |   2 +-
 scraper.py      |   3 +
 xlog.py         |  27 ++++--
 4 files changed, 143 insertions(+), 118 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 2219219..ad5871f 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,55 +1,57 @@
-from os import replace
-import requests
-# import hashlib
-from pathlib import Path
-import secrets
+# build-in lib
 import os.path
 import os
+import re
 import uuid
 import json
 import time
-from lxml import etree
-import re
-import config
 import typing
-from urllib.parse import urljoin
-import mechanicalsoup
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-from cloudscraper import create_scraper
-from concurrent.futures import ThreadPoolExecutor
 from unicodedata import category
+from concurrent.futures import ThreadPoolExecutor
+
+# third party lib
+import requests
+from requests.adapters import HTTPAdapter
+import mechanicalsoup
+from pathlib import Path
+from urllib3.util.retry import Retry
+from lxml import etree
+from cloudscraper import create_scraper
+
+# project wide
+import config
 
 
-def getXpathSingle(htmlcode, xpath):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def get_xpath_single(html_code: str, xpath):
+    html = etree.fromstring(html_code, etree.HTMLParser())
     result1 = str(html.xpath(xpath)).strip(" ['']")
     return result1
 
 
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
 
-def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, json_headers = None):
+
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, json_headers=None):
     """
     网页请求核心函数
     """
     verify = config.getInstance().cacert_file()
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     errors = ""
     headers = {"User-Agent": ua or G_USER_AGENT} # noqa
-    if json_headers != None:
+    if json_headers is not None:
         headers.update(json_headers)
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
-                proxies = configProxy.proxies()
-                result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
+            if config_proxy.enable:
+                proxies = config_proxy.proxies()
+                result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, proxies=proxies,
                                       verify=verify, cookies=cookies)
             else:
-                result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
+                result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, cookies=cookies)
 
             if return_type == "object":
                 return result
@@ -59,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
                 result.encoding = encoding or result.apparent_encoding
                 return result.text
         except Exception as e:
-            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
+            print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry))
             errors = str(e)
     if "getaddrinfo failed" in errors:
         print("[-]Connect Failed! \nPlease Check your proxy config")
@@ -71,8 +73,9 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
     print('[-]Connect Failed! Please check your Proxy or Network!')
     raise Exception('Connect Failed')
 
+
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     errors = ""
     headers_ua = {"User-Agent": G_USER_AGENT}
     if headers is None:
@@ -80,16 +83,16 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
     else:
         headers.update(headers_ua)
 
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
-                proxies = configProxy.proxies()
-                result = requests.post(url, data=query, proxies=proxies, headers=headers, timeout=configProxy.timeout)
+            if config_proxy.enable:
+                proxies = config_proxy.proxies()
+                result = requests.post(url, data=query, proxies=proxies, headers=headers, timeout=config_proxy.timeout)
             else:
-                result = requests.post(url, data=query, headers=headers, timeout=configProxy.timeout)
+                result = requests.post(url, data=query, headers=headers, timeout=config_proxy.timeout)
             return result
         except Exception as e:
-            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
+            print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry))
             errors = str(e)
     print("[-]Connect Failed! Please check your Proxy or Network!")
     print("[-]" + errors)
@@ -116,17 +119,17 @@ class TimeoutHTTPAdapter(HTTPAdapter):
 # with keep-alive feature
 def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                      encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     session = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    if configProxy.enable:
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    if config_proxy.enable:
         session.verify = config.getInstance().cacert_file()
-        session.proxies = configProxy.proxies()
+        session.proxies = config_proxy.proxies()
     headers = {"User-Agent": ua or G_USER_AGENT}
     session.headers = headers
     try:
@@ -156,17 +159,17 @@ def get_html_session(url: str = None, cookies: dict = None, ua: str = None, retu
 
 def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                         encoding: str = None, use_scraper: bool = False):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     s = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    if configProxy.enable:
+    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    if config_proxy.enable:
         s.verify = config.getInstance().cacert_file()
-        s.proxies = configProxy.proxies()
+        s.proxies = config_proxy.proxies()
     try:
         browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
         if isinstance(url, str) and len(url):
@@ -194,17 +197,17 @@ def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, r
 
 def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
                      return_type: str = None, encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    if configProxy.enable:
+    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    if config_proxy.enable:
         s.verify = config.getInstance().cacert_file()
-        s.proxies = configProxy.proxies()
+        s.proxies = config_proxy.proxies()
     try:
         browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
         result = browser.open(url)
@@ -234,17 +237,17 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
 
 def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                         encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
-    if configProxy.enable:
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
+    if config_proxy.enable:
         session.verify = config.getInstance().cacert_file()
-        session.proxies = configProxy.proxies()
+        session.proxies = config_proxy.proxies()
     try:
         if isinstance(url, str) and len(url):
             result = session.get(str(url))
@@ -415,55 +418,16 @@ def is_japanese(raw: str) -> bool:
     return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
 
 
-# Usage: python ./ADC_function.py https://cn.bing.com/
-if __name__ == "__main__":
-    import sys, timeit
-    from http.client import HTTPConnection
-
-
-    def benchmark(t, url):
-        print(f"HTTP GET Benchmark times:{t} url:{url}")
-        tm = timeit.timeit(f"_ = session1.get('{url}')",
-                           "from __main__ import get_html_session;session1=get_html_session()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
-                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = browser1.open('{url}')",
-                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = get_html('{url}')",
-                           "from __main__ import get_html",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html()')
-
-
-    t = 100
-
-    # url = "https://www.189.cn/"
-    url = "http://www.chinaunicom.com"
-    HTTPConnection.debuglevel = 1
-    s = get_html_session()
-    _ = s.get(url)
-    HTTPConnection.debuglevel = 0
-    if len(sys.argv) > 1:
-        url = sys.argv[1]
-    benchmark(t, url)
-
-
 def download_file_with_filename(url: str, filename: str, path: str) -> None:
     """
     download file save to give path with given name from given url
     """
     conf = config.getInstance()
-    configProxy = conf.proxy()
+    config_proxy = conf.proxy()
 
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
+            if config_proxy.enable:
                 if not os.path.exists(path):
                     try:
                         os.makedirs(path)
@@ -491,18 +455,18 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
             with open(os.path.join(path, filename), "wb") as code:
                 code.write(r)
             return
-        except requests.exceptions.RequestException:
-            i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
-        except requests.exceptions.ConnectionError:
-            i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
         except requests.exceptions.ProxyError:
             i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
         except requests.exceptions.ConnectTimeout:
            i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
+        except requests.exceptions.ConnectionError:
+            i += 1
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
+        except requests.exceptions.RequestException:
+            i += 1
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
         except IOError:
             raise ValueError(f"[-]Create Directory '{path}' failed!")
             return
@@ -518,7 +482,7 @@ def download_one_file(args) -> str:
     """
 
     (url, save_path, json_headers) = args
-    if json_headers != None:
+    if json_headers is not None:
         filebytes = get_html(url, return_type='content', json_headers=json_headers['headers'])
     else:
         filebytes = get_html(url, return_type='content')
@@ -574,10 +538,57 @@ def delete_all_elements_in_str(string_delete: str, string: str):
     """
     for i in string:
         if i == string_delete:
-            string = string.replace(i,"")
+            string = string.replace(i, "")
     return string
 
 
 # print format空格填充对齐内容包含中文时的空格计算
-def cnspace(v: str, n: int) -> int:
+def cn_space(v: str, n: int) -> int:
     return n - [category(c) for c in v].count('Lo')
+
+
+"""
+Usage: python ./ADC_function.py https://cn.bing.com/
+Purpose: benchmark get_html_session
+         benchmark get_html_by_scraper
+         benchmark get_html_by_browser
+         benchmark get_html
+TODO: may be this should move to unittest directory
+"""
+if __name__ == "__main__":
+    import sys, timeit
+    from http.client import HTTPConnection
+
+
+    def benchmark(times: int, url):
+        print(f"HTTP GET Benchmark times:{times} url:{url}")
+        tm = timeit.timeit(f"_ = session1.get('{url}')",
+                           "from __main__ import get_html_session;session1=get_html_session()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
+                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = browser1.open('{url}')",
+                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = get_html('{url}')",
+                           "from __main__ import get_html",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html()')
+
+
+    # target_url = "https://www.189.cn/"
+    target_url = "http://www.chinaunicom.com"
+    HTTPConnection.debuglevel = 1
+    html_session = get_html_session()
+    _ = html_session.get(target_url)
+    HTTPConnection.debuglevel = 0
+
+    # times
+    t = 100
+    if len(sys.argv) > 1:
+        target_url = sys.argv[1]
+    benchmark(t, target_url)
diff --git a/core.py b/core.py
index c8364ef..4a4b714 100644
--- a/core.py
+++ b/core.py
@@ -692,7 +692,7 @@ def debug_print(data: json):
             if i == 'extrafanart':
                 print('[+] -', "%-19s" % i, ':', len(v), 'links')
                 continue
-            print(f'[+] - {i:<{cnspace(i,19)}} : {v}')
+            print(f'[+] - {i:<{cn_space(i, 19)}} : {v}')
 
         print("[+] ------- DEBUG INFO -------")
     except:
diff --git a/scraper.py b/scraper.py
index f8bbc06..ab95e3f 100644
--- a/scraper.py
+++ b/scraper.py
@@ -7,6 +7,7 @@ from pathlib import Path
 from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
 from scrapinglib.api import search
 
+
 def get_data_from_json(file_number, oCC, specified_source, specified_url):
     """
     iterate through all services and fetch the data 从JSON返回元数据
@@ -180,6 +181,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
     if oCC:
         cc_vars = conf.cc_convert_vars().split(",")
         ccm = conf.cc_convert_mode()
+
         def convert_list(mapping_data,language,vars):
             total = []
             for i in vars:
@@ -187,6 +189,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
                 i = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")[0]
                 total.append(i)
             return total
+
         def convert(mapping_data,language,vars):
             if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
                 return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
diff --git a/xlog.py b/xlog.py
index 956a77b..a91423a 100755
--- a/xlog.py
+++ b/xlog.py
@@ -1,4 +1,3 @@
-
 import os
 import sys
 import time
@@ -18,7 +17,8 @@
 INFO = 20
 DEBUG = 10
 NOTSET = 0
-class Logger():
+
+class Logger:
     def __init__(self, name, buffer_size=0, file_name=None, roll_num=1):
         self.err_color = '\033[0m'
         self.warn_color = '\033[0m'
@@ -28,7 +28,7 @@ class Logger():
         self.name = str(name)
         self.file_max_size = 1024 * 1024
         self.buffer_lock = threading.Lock()
-        self.buffer = {} # id => line
+        self.buffer = {}  # id => line
         self.buffer_size = buffer_size
         self.last_no = 0
         self.min_level = NOTSET
@@ -107,7 +107,7 @@ class Logger():
             if not os.path.isfile(old_name):
                 continue
 
-            #self.info("roll_log %s -> %s", old_name, new_name)
+            # self.info("roll_log %s -> %s", old_name, new_name)
             shutil.move(old_name, new_name)
 
         shutil.move(self.log_filename, self.log_filename + ".1")
@@ -157,7 +157,8 @@ class Logger():
             if buffer_len > self.buffer_size:
                 del self.buffer[self.last_no - self.buffer_size]
         except Exception as e:
-            string = '%s - [%s]LOG_EXCEPT: %s, Except:%s\n%s' % (time.ctime()[4:-5], level, fmt % args, e, traceback.format_exc())
+            string = '%s - [%s]LOG_EXCEPT: %s, Except:%s\n%s' % (
+                time.ctime()[4:-5], level, fmt % args, e, traceback.format_exc())
             self.last_no += 1
             self.buffer[self.last_no] = string
             buffer_len = len(self.buffer)
@@ -202,7 +203,7 @@ class Logger():
     def tofile(self, fmt, *args, **kwargs):
         self.log_to_file('@', self.warn_color, fmt, *args, **kwargs)
 
-    #=================================================================
+    # =================================================================
     def set_buffer_size(self, set_size):
         self.buffer_lock.acquire()
         self.buffer_size = set_size
@@ -255,8 +256,10 @@ class Logger():
             print(("Except stack:%s" % traceback.format_exc()))
             return ""
 
+
 loggerDict = {}
 
+
 def getLogger(name=None, buffer_size=0, file_name=None, roll_num=1):
     global loggerDict, default_log
     if name is None:
@@ -279,29 +282,38 @@ def getLogger(name=None, buffer_size=0, file_name=None, roll_num=1):
         default_log = logger_instance
     return logger_instance
 
+
 default_log = getLogger()
 
+
 def debg(fmt, *args, **kwargs):
     default_log.debug(fmt, *args, **kwargs)
 
+
 def info(fmt, *args, **kwargs):
     default_log.info(fmt, *args, **kwargs)
 
+
 def warn(fmt, *args, **kwargs):
     default_log.warning(fmt, *args, **kwargs)
 
+
 def erro(fmt, *args, **kwargs):
     default_log.error(fmt, *args, **kwargs)
 
+
 def excp(fmt, *args, **kwargs):
     default_log.exception(fmt, *args, **kwargs)
 
+
 def crit(fmt, *args, **kwargs):
     default_log.critical(fmt, *args, **kwargs)
 
+
 def tofile(fmt, *args, **kwargs):
     default_log.tofile(fmt, *args, **kwargs)
 
+
 if __name__ == '__main__':
     log_file = os.path.join(os.path.dirname(sys.argv[0]), "test.log")
     getLogger().set_file(log_file)
@@ -313,7 +325,6 @@ if __name__ == '__main__':
     tofile("write to file only")
 
     try:
-        1/0
+        1 / 0
     except Exception as e:
         excp("An error has occurred")
-
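
Reviewer note on the retry/timeout wiring: every get_html_* helper in ADC_function.py mounts urllib3's Retry (for 429/5xx responses) together with TimeoutHTTPAdapter, which only appears as hunk context in this diff, so each request inherits config_proxy.timeout and config_proxy.retry from one place. The sketch below is a minimal standalone restatement of that pattern; the adapter body is a typical implementation rather than a copy of the repository's, and build_session, its defaults, and the httpbin URL are illustrative only.

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry


    class TimeoutHTTPAdapter(HTTPAdapter):
        """Apply a default timeout to every request sent through this adapter."""

        def __init__(self, *args, timeout=10, **kwargs):
            self.timeout = timeout
            super().__init__(*args, **kwargs)

        def send(self, request, **kwargs):
            # requests passes timeout=None unless the caller supplied one
            if kwargs.get("timeout") is None:
                kwargs["timeout"] = self.timeout
            return super().send(request, **kwargs)


    def build_session(retry: int = 3, timeout: int = 10) -> requests.Session:
        retries = Retry(total=retry, connect=retry, backoff_factor=1,
                        status_forcelist=[429, 500, 502, 503, 504])
        session = requests.Session()
        session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
        session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
        return session


    if __name__ == "__main__":
        s = build_session()
        print(s.get("https://httpbin.org/get").status_code)  # example URL, not from the patch

Putting the policy on the session, instead of repeating timeout= on every call, is what lets the keep-alive helpers share one retry and timeout configuration.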
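
Note on cn_space(): unicodedata.category() returns 'Lo' (Letter, other) for CJK characters, which occupy two terminal columns but count as a single character in str formatting, so the helper shrinks the pad width by one per such character. A small demo of the aligned output that debug_print() in core.py relies on; the sample keys and values are made up for illustration.

    from unicodedata import category


    def cn_space(v: str, n: int) -> int:
        # shrink the format width by one for every double-width ('Lo') character
        return n - [category(c) for c in v].count('Lo')


    # illustrative data only
    rows = {"number": "ABC-123", "title": "Example", "标题": "示例", "actor": "someone"}
    for key, value in rows.items():
        print(f'[+] - {key:<{cn_space(key, 19)}} : {value}')

With a plain {key:<19} pad, rows whose keys contain Chinese characters would drift one column to the right per character; subtracting the 'Lo' count keeps the colons aligned.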
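
Note on download_one_file(): it takes a single (url, save_path, json_headers) tuple, which is the shape ThreadPoolExecutor.map() hands to its worker, and ADC_function.py imports ThreadPoolExecutor, so a pooled driver is the likely intended use. The caller is not part of this diff, so the snippet below is a hypothetical usage sketch only; the URLs, paths, and worker count are placeholders, and it assumes it runs inside the repository where ADC_function can be imported.

    from concurrent.futures import ThreadPoolExecutor

    from ADC_function import download_one_file  # defined in ADC_function.py

    # placeholder tasks in the (url, save_path, json_headers) shape
    tasks = [
        ("https://example.com/img/1.jpg", "extrafanart/extrafanart-1.jpg", None),
        ("https://example.com/img/2.jpg", "extrafanart/extrafanart-2.jpg", None),
    ]

    with ThreadPoolExecutor(max_workers=5) as pool:
        results = list(pool.map(download_one_file, tasks))
    print(results)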