PEP8: tidy blank lines and spacing
PEP8: rename variables to snake_case, document the __main__ benchmark block, and keep its variables distinct from locals used elsewhere (global and local variable isolation)
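The renaming follows one pattern throughout the diff: camelCase locals such as configProxy become config_proxy, and the benchmark code under if __name__ == "__main__": gets descriptive names (target_url, html_session) instead of url and s, since names bound in that block are module-level globals and are easy to confuse with the parameters and locals of the functions above. A minimal, self-contained sketch of that last point (the names and URL here are illustrative, not taken from the repository):

import timeit


def fetch(url: str) -> str:
    # `url` here is a parameter local to this function call.
    return f"GET {url}"


if __name__ == "__main__":
    # Code in this block runs at module scope, so every assignment creates a
    # module global. A distinct, descriptive name (target_url rather than url)
    # keeps it from being mistaken for the parameter above.
    target_url = "https://example.com/"
    print(fetch(target_url))
    t = 100  # number of benchmark repetitions
    print(timeit.timeit(lambda: fetch(target_url), number=t))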
ADC_function.py (225 changed lines)
@@ -1,55 +1,57 @@
-from os import replace
+# build-in lib
-import requests
-# import hashlib
-from pathlib import Path
-import secrets
 import os.path
 import os
+import re
 import uuid
 import json
 import time
-from lxml import etree
-import re
-import config
 import typing
-from urllib.parse import urljoin
-import mechanicalsoup
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-from cloudscraper import create_scraper
-from concurrent.futures import ThreadPoolExecutor
 from unicodedata import category
+from concurrent.futures import ThreadPoolExecutor
 
+# third party lib
+import requests
+from requests.adapters import HTTPAdapter
+import mechanicalsoup
+from pathlib import Path
+from urllib3.util.retry import Retry
+from lxml import etree
+from cloudscraper import create_scraper
 
+# project wide
+import config
 
 
-def getXpathSingle(htmlcode, xpath):
+def get_xpath_single(html_code: str, xpath):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    html = etree.fromstring(html_code, etree.HTMLParser())
     result1 = str(html.xpath(xpath)).strip(" ['']")
     return result1
 
 
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
 
 
 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, json_headers=None):
     """
     网页请求核心函数
     """
     verify = config.getInstance().cacert_file()
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     errors = ""
 
     headers = {"User-Agent": ua or G_USER_AGENT} # noqa
-    if json_headers != None:
+    if json_headers is not None:
         headers.update(json_headers)
 
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
+            if config_proxy.enable:
-                proxies = configProxy.proxies()
+                proxies = config_proxy.proxies()
-                result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
+                result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, proxies=proxies,
                                       verify=verify,
                                       cookies=cookies)
             else:
-                result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
+                result = requests.get(str(url), headers=headers, timeout=config_proxy.timeout, cookies=cookies)
 
             if return_type == "object":
                 return result
@@ -59,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
             result.encoding = encoding or result.apparent_encoding
             return result.text
         except Exception as e:
-            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
+            print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry))
             errors = str(e)
     if "getaddrinfo failed" in errors:
         print("[-]Connect Failed! Please Check your proxy config")
@@ -71,8 +73,9 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
         print('[-]Connect Failed! Please check your Proxy or Network!')
         raise Exception('Connect Failed')
 
+
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     errors = ""
     headers_ua = {"User-Agent": G_USER_AGENT}
     if headers is None:
@@ -80,16 +83,16 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
     else:
         headers.update(headers_ua)
 
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
+            if config_proxy.enable:
-                proxies = configProxy.proxies()
+                proxies = config_proxy.proxies()
-                result = requests.post(url, data=query, proxies=proxies, headers=headers, timeout=configProxy.timeout)
+                result = requests.post(url, data=query, proxies=proxies, headers=headers, timeout=config_proxy.timeout)
             else:
-                result = requests.post(url, data=query, headers=headers, timeout=configProxy.timeout)
+                result = requests.post(url, data=query, headers=headers, timeout=config_proxy.timeout)
             return result
         except Exception as e:
-            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
+            print("[-]Connect retry {}/{}".format(i + 1, config_proxy.retry))
             errors = str(e)
     print("[-]Connect Failed! Please check your Proxy or Network!")
     print("[-]" + errors)
@@ -116,17 +119,17 @@ class TimeoutHTTPAdapter(HTTPAdapter):
 # with keep-alive feature
 def get_html_session(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                      encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     session = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    if configProxy.enable:
+    if config_proxy.enable:
         session.verify = config.getInstance().cacert_file()
-        session.proxies = configProxy.proxies()
+        session.proxies = config_proxy.proxies()
     headers = {"User-Agent": ua or G_USER_AGENT}
     session.headers = headers
     try:
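Note on the hunk above: get_html_session wires urllib3's Retry policy into the project's TimeoutHTTPAdapter (defined earlier in ADC_function.py) so that every request through the session gets both automatic retries and a default timeout. A minimal sketch of the same pattern, using illustrative names (TimeoutAdapter, make_session) rather than the project's own:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class TimeoutAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to every request it sends."""

    def __init__(self, *args, timeout=10, **kwargs):
        self._timeout = timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Only fill in the timeout when the caller did not supply one.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self._timeout
        return super().send(request, **kwargs)


def make_session(retry: int = 3, timeout: int = 10) -> requests.Session:
    session = requests.Session()
    retries = Retry(total=retry, connect=retry, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504])
    adapter = TimeoutAdapter(max_retries=retries, timeout=timeout)
    # Mount one adapter for both schemes so retries and timeouts stay consistent.
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session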
@@ -156,17 +159,17 @@ def get_html_session(url: str = None, cookies: dict = None, ua: str = None, retu
 
 def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                         encoding: str = None, use_scraper: bool = False):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     s = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    if configProxy.enable:
+    if config_proxy.enable:
         s.verify = config.getInstance().cacert_file()
-        s.proxies = configProxy.proxies()
+        s.proxies = config_proxy.proxies()
     try:
         browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
         if isinstance(url, str) and len(url):
@@ -194,17 +197,17 @@ def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, r
 
 def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
                      return_type: str = None, encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    if configProxy.enable:
+    if config_proxy.enable:
         s.verify = config.getInstance().cacert_file()
-        s.proxies = configProxy.proxies()
+        s.proxies = config_proxy.proxies()
     try:
         browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s)
         result = browser.open(url)
@@ -234,17 +237,17 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
 
 def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
                         encoding: str = None):
-    configProxy = config.getInstance().proxy()
+    config_proxy = config.getInstance().proxy()
     session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
     if isinstance(cookies, dict) and len(cookies):
         requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1,
+    retries = Retry(total=config_proxy.retry, connect=config_proxy.retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=config_proxy.timeout))
-    if configProxy.enable:
+    if config_proxy.enable:
         session.verify = config.getInstance().cacert_file()
-        session.proxies = configProxy.proxies()
+        session.proxies = config_proxy.proxies()
     try:
         if isinstance(url, str) and len(url):
             result = session.get(str(url))
@@ -415,55 +418,16 @@ def is_japanese(raw: str) -> bool:
     return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
 
 
-# Usage: python ./ADC_function.py https://cn.bing.com/
-if __name__ == "__main__":
-    import sys, timeit
-    from http.client import HTTPConnection
-
-
-    def benchmark(t, url):
-        print(f"HTTP GET Benchmark times:{t} url:{url}")
-        tm = timeit.timeit(f"_ = session1.get('{url}')",
-                           "from __main__ import get_html_session;session1=get_html_session()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
-                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = browser1.open('{url}')",
-                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
-        tm = timeit.timeit(f"_ = get_html('{url}')",
-                           "from __main__ import get_html",
-                           number=t)
-        print(f' *{tm:>10.5f}s get_html()')
-
-
-    t = 100
-
-    # url = "https://www.189.cn/"
-    url = "http://www.chinaunicom.com"
-    HTTPConnection.debuglevel = 1
-    s = get_html_session()
-    _ = s.get(url)
-    HTTPConnection.debuglevel = 0
-    if len(sys.argv) > 1:
-        url = sys.argv[1]
-    benchmark(t, url)
-
-
 def download_file_with_filename(url: str, filename: str, path: str) -> None:
     """
     download file save to give path with given name from given url
     """
     conf = config.getInstance()
-    configProxy = conf.proxy()
+    config_proxy = conf.proxy()
 
-    for i in range(configProxy.retry):
+    for i in range(config_proxy.retry):
         try:
-            if configProxy.enable:
+            if config_proxy.enable:
                 if not os.path.exists(path):
                     try:
                         os.makedirs(path)
@@ -491,18 +455,18 @@ def download_file_with_filename(url: str, filename: str, path: str) -> None:
             with open(os.path.join(path, filename), "wb") as code:
                 code.write(r)
             return
-        except requests.exceptions.RequestException:
-            i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
-        except requests.exceptions.ConnectionError:
-            i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
         except requests.exceptions.ProxyError:
             i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
         except requests.exceptions.ConnectTimeout:
             i += 1
-            print('[-]Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
+        except requests.exceptions.ConnectionError:
+            i += 1
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
+        except requests.exceptions.RequestException:
+            i += 1
+            print('[-]Download : Connect retry ' + str(i) + '/' + str(config_proxy.retry))
         except IOError:
             raise ValueError(f"[-]Create Directory '{path}' failed!")
             return
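Note on the hunk above: the except clauses are reordered from most specific to most general. In requests, ProxyError and ConnectTimeout are subclasses of ConnectionError, and all of them derive from RequestException, so the old order (RequestException and ConnectionError first) made the later, more specific handlers unreachable. A small self-contained illustration (the function and URL are hypothetical, not project code):

import requests


def fetch(url: str) -> bytes:
    try:
        return requests.get(url, timeout=10).content
    # Specific subclasses must come first; RequestException is the common base
    # class, so placing it first would swallow every case below.
    except requests.exceptions.ProxyError:
        raise RuntimeError("proxy error")
    except requests.exceptions.ConnectTimeout:
        raise RuntimeError("connect timeout")
    except requests.exceptions.ConnectionError:
        raise RuntimeError("connection error")
    except requests.exceptions.RequestException as exc:
        raise RuntimeError(f"request failed: {exc}")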
@@ -518,7 +482,7 @@ def download_one_file(args) -> str:
     """
 
     (url, save_path, json_headers) = args
-    if json_headers != None:
+    if json_headers is not None:
         filebytes = get_html(url, return_type='content', json_headers=json_headers['headers'])
     else:
         filebytes = get_html(url, return_type='content')
@@ -579,5 +543,52 @@ def delete_all_elements_in_str(string_delete: str, string: str):
 
 
 # print format空格填充对齐内容包含中文时的空格计算
-def cnspace(v: str, n: int) -> int:
+def cn_space(v: str, n: int) -> int:
     return n - [category(c) for c in v].count('Lo')
+
+
+"""
+Usage: python ./ADC_function.py https://cn.bing.com/
+Purpose: benchmark get_html_session
+         benchmark get_html_by_scraper
+         benchmark get_html_by_browser
+         benchmark get_html
+TODO: may be this should move to unittest directory
+"""
+if __name__ == "__main__":
+    import sys, timeit
+    from http.client import HTTPConnection
+
+
+    def benchmark(times: int, url):
+        print(f"HTTP GET Benchmark times:{times} url:{url}")
+        tm = timeit.timeit(f"_ = session1.get('{url}')",
+                           "from __main__ import get_html_session;session1=get_html_session()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = scraper1.get('{url}')",
+                           "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = browser1.open('{url}')",
+                           "from __main__ import get_html_by_browser;browser1=get_html_by_browser()",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable')
+        tm = timeit.timeit(f"_ = get_html('{url}')",
+                           "from __main__ import get_html",
+                           number=times)
+        print(f' *{tm:>10.5f}s get_html()')
+
+
+    # target_url = "https://www.189.cn/"
+    target_url = "http://www.chinaunicom.com"
+    HTTPConnection.debuglevel = 1
+    html_session = get_html_session()
+    _ = html_session.get(target_url)
+    HTTPConnection.debuglevel = 0
+
+    # times
+    t = 100
+    if len(sys.argv) > 1:
+        target_url = sys.argv[1]
+    benchmark(t, target_url)
core.py (2 changed lines)
@@ -692,7 +692,7 @@ def debug_print(data: json):
             if i == 'extrafanart':
                 print('[+] -', "%-19s" % i, ':', len(v), 'links')
                 continue
-            print(f'[+] - {i:<{cnspace(i,19)}} : {v}')
+            print(f'[+] - {i:<{cn_space(i, 19)}} : {v}')
 
         print("[+] ------- DEBUG INFO -------")
     except:
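Note on the hunk above: cn_space (renamed from cnspace) compensates for CJK characters typically occupying two terminal columns. It counts the characters whose Unicode category is 'Lo' (letters such as CJK ideographs and kana) and subtracts that count from the requested pad width, so mixed Chinese and ASCII keys still line up in the debug output. A self-contained sketch of the same trick (the sample labels are illustrative):

from unicodedata import category


def cn_space(v: str, n: int) -> int:
    # Each 'Lo' character is usually rendered double-width, so shrink the pad
    # width by one column per such character.
    return n - [category(c) for c in v].count('Lo')


for label in ("title", "标题", "actor"):
    # The colons line up in a monospaced terminal despite the wide characters.
    print(f'[+] - {label:<{cn_space(label, 19)}} : value')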
@@ -7,6 +7,7 @@ from pathlib import Path
 from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
 from scrapinglib.api import search
 
+
 def get_data_from_json(file_number, oCC, specified_source, specified_url):
     """
     iterate through all services and fetch the data 从JSON返回元数据
@@ -180,6 +181,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
     if oCC:
         cc_vars = conf.cc_convert_vars().split(",")
         ccm = conf.cc_convert_mode()
+
         def convert_list(mapping_data,language,vars):
             total = []
             for i in vars:
@@ -187,6 +189,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
                 i = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")[0]
                 total.append(i)
             return total
+
         def convert(mapping_data,language,vars):
             if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
                 return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
xlog.py (19 changed lines)
@@ -1,4 +1,3 @@
-
 import os
 import sys
 import time
@@ -18,7 +17,8 @@ INFO = 20
 DEBUG = 10
 NOTSET = 0
 
-class Logger():
+
+class Logger:
     def __init__(self, name, buffer_size=0, file_name=None, roll_num=1):
         self.err_color = '\033[0m'
         self.warn_color = '\033[0m'
@@ -157,7 +157,8 @@ class Logger():
             if buffer_len > self.buffer_size:
                 del self.buffer[self.last_no - self.buffer_size]
         except Exception as e:
-            string = '%s - [%s]LOG_EXCEPT: %s, Except:%s<br> %s' % (time.ctime()[4:-5], level, fmt % args, e, traceback.format_exc())
+            string = '%s - [%s]LOG_EXCEPT: %s, Except:%s<br> %s' % (
+                time.ctime()[4:-5], level, fmt % args, e, traceback.format_exc())
             self.last_no += 1
             self.buffer[self.last_no] = string
             buffer_len = len(self.buffer)
@@ -255,8 +256,10 @@ class Logger():
             print(("Except stack:%s" % traceback.format_exc()))
             return ""
 
+
 loggerDict = {}
 
+
 def getLogger(name=None, buffer_size=0, file_name=None, roll_num=1):
     global loggerDict, default_log
     if name is None:
@@ -279,29 +282,38 @@ def getLogger(name=None, buffer_size=0, file_name=None, roll_num=1):
         default_log = logger_instance
     return logger_instance
 
+
 default_log = getLogger()
 
+
 def debg(fmt, *args, **kwargs):
     default_log.debug(fmt, *args, **kwargs)
 
+
 def info(fmt, *args, **kwargs):
     default_log.info(fmt, *args, **kwargs)
 
+
 def warn(fmt, *args, **kwargs):
     default_log.warning(fmt, *args, **kwargs)
 
+
 def erro(fmt, *args, **kwargs):
     default_log.error(fmt, *args, **kwargs)
 
+
 def excp(fmt, *args, **kwargs):
     default_log.exception(fmt, *args, **kwargs)
 
+
 def crit(fmt, *args, **kwargs):
     default_log.critical(fmt, *args, **kwargs)
 
+
 def tofile(fmt, *args, **kwargs):
     default_log.tofile(fmt, *args, **kwargs)
 
+
 if __name__ == '__main__':
     log_file = os.path.join(os.path.dirname(sys.argv[0]), "test.log")
     getLogger().set_file(log_file)
@@ -316,4 +328,3 @@ if __name__ == '__main__':
         1 / 0
     except Exception as e:
         excp("An error has occurred")
-