From 921f926056da150741b07902a69653f5fc80504f Mon Sep 17 00:00:00 2001 From: lededev Date: Sat, 23 Oct 2021 05:21:08 +0800 Subject: [PATCH] ADC_function.py: add browser retry and timeout --- ADC_function.py | 50 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 36be657..4da8909 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -12,6 +12,8 @@ import re import config from urllib.parse import urljoin import mechanicalsoup +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry def getXpathSingle(htmlcode, xpath): @@ -28,10 +30,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None) configProxy = config.getInstance().proxy() errors = "" - if ua is None: - headers = {"User-Agent": G_USER_AGENT} # noqa - else: - headers = {"User-Agent": ua} + headers = {"User-Agent": ua or G_USER_AGENT} # noqa for i in range(configProxy.retry): try: @@ -84,15 +83,32 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response: print("[-]" + errors) +G_DEFAULT_TIMEOUT = 10 # seconds + +class TimeoutHTTPAdapter(HTTPAdapter): + def __init__(self, *args, **kwargs): + self.timeout = G_DEFAULT_TIMEOUT + if "timeout" in kwargs: + self.timeout = kwargs["timeout"] + del kwargs["timeout"] + super().__init__(*args, **kwargs) + def send(self, request, **kwargs): + timeout = kwargs.get("timeout") + if timeout is None: + kwargs["timeout"] = self.timeout + return super().send(request, **kwargs) + def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None): - s = None - if isinstance(cookies, dict) and len(cookies): - s = requests.Session() - requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() + s = requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) if configProxy.enable: - browser.session.proxies = configProxy.proxies() + s.proxies = configProxy.proxies() + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) result = browser.open(url) if not result.ok: return '' @@ -108,14 +124,16 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None): - s = None - if isinstance(cookies, dict) and len(cookies): - s = requests.Session() - requests.utils.add_dict_to_cookiejar(s.cookies, cookies) - browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s) configProxy = config.getInstance().proxy() + s = requests.Session() + if isinstance(cookies, dict) and len(cookies): + requests.utils.add_dict_to_cookiejar(s.cookies, cookies) + retries = Retry(connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + s.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) + s.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout)) if configProxy.enable: - browser.session.proxies = configProxy.proxies() + s.proxies = configProxy.proxies() + browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=s) result = browser.open(url) if not result.ok: return ''