diff --git a/ADC_function.py b/ADC_function.py
index 21fda6a..2198939 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -14,6 +14,7 @@ from urllib.parse import urljoin
 import mechanicalsoup
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
+from cloudscraper import create_scraper
 
 
 def getXpathSingle(htmlcode, xpath):
@@ -25,7 +26,7 @@ def getXpathSingle(htmlcode, xpath):
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
 
 # core HTTP request helper
-def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     verify = config.getInstance().cacert_file()
     configProxy = config.getInstance().proxy()
     errors = ""
@@ -41,13 +42,12 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
         else:
             result = requests.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
 
-        result.encoding = "utf-8"
-
         if return_type == "object":
             return result
         elif return_type == "content":
             return result.content
         else:
+            result.encoding = encoding or "utf-8"
             return result.text
     except requests.exceptions.ProxyError:
         print("[-]Proxy error! Please check your Proxy")
@@ -100,7 +100,7 @@ class TimeoutHTTPAdapter(HTTPAdapter):
 
 
 # with keep-alive feature
-def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html_session(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     session = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
@@ -127,7 +127,7 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
         elif return_type == "session":
             return result, session
         else:
-            result.encoding = "utf-8"
+            result.encoding = encoding or "utf-8"
             return result.text
     except requests.exceptions.ProxyError:
         print("[-]get_html_session() Proxy error! Please check your Proxy")
@@ -136,7 +136,7 @@ def get_html_session(url:str = None, cookies: dict = None, ua: str = None, retur
     return None
 
 
-def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None):
+def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
@@ -155,7 +155,7 @@ def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, re
             return browser
         if not result.ok:
             return None
-        result.encoding = "utf-8"
+
         if return_type == "object":
             return result
         elif return_type == "content":
@@ -163,6 +163,7 @@ def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, re
         elif return_type == "browser":
             return result, browser
         else:
+            result.encoding = encoding or "utf-8"
             return result.text
     except requests.exceptions.ProxyError:
         print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
@@ -170,7 +171,8 @@ def get_html_by_browser(url:str = None, cookies: dict = None, ua: str = None, re
         print(f'[-]get_html_by_browser() Failed! {e}')
     return None
 
-def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
     configProxy = config.getInstance().proxy()
     s = requests.Session()
     if isinstance(cookies, dict) and len(cookies):
@@ -191,7 +193,7 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
         for k, v in fields.items():
             browser[k] = v
         response = browser.submit_selected()
-        response.encoding = "utf-8"
+
         if return_type == "object":
             return response
         elif return_type == "content":
@@ -199,6 +201,7 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
         elif return_type == "browser":
             return response, browser
         else:
+            response.encoding = encoding or "utf-8"
             return response.text
     except requests.exceptions.ProxyError:
         print("[-]get_html_by_form() Proxy error! Please check your Proxy")
@@ -207,6 +210,40 @@ def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies:
     return None
 
 
+def get_html_by_scraper(url:str = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
+    configProxy = config.getInstance().proxy()
+    session = create_scraper(browser={'custom': ua or G_USER_AGENT,})
+    if isinstance(cookies, dict) and len(cookies):
+        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
+    retries = Retry(total=configProxy.retry, connect=configProxy.retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
+    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=configProxy.timeout))
+    if configProxy.enable:
+        session.verify = config.getInstance().cacert_file()
+        session.proxies = configProxy.proxies()
+    try:
+        if isinstance(url, str) and len(url):
+            result = session.get(str(url))
+        else:  # an empty url returns the reusable scraper object directly; no return_type needed
+            return session
+        if not result.ok:
+            return None
+        if return_type == "object":
+            return result
+        elif return_type == "content":
+            return result.content
+        elif return_type == "scraper":
+            return result, session
+        else:
+            result.encoding = encoding or "utf-8"
+            return result.text
+    except requests.exceptions.ProxyError:
+        print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
+    except Exception as e:
+        print(f"[-]get_html_by_scraper() failed. {e}")
{e}") + return None + + # def get_javlib_cookie() -> [dict, str]: # import cloudscraper # switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy() @@ -701,31 +738,35 @@ def is_japanese(s) -> bool: return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE)) +# Usage: python ./ADC_function.py https://cn.bing.com/ if __name__ == "__main__": import sys, timeit from http.client import HTTPConnection - s = get_html_session() def benchmark(t, url): print(f"HTTP GET Benchmark times:{t} url:{url}") tm = timeit.timeit(f"_ = session1.get('{url}')", "from __main__ import get_html_session;session1=get_html_session()", number=t) - print(f'===={tm:2.5f}s get_html_session() Keep-Alive enable====') + print(f' *{tm:>10.5f}s get_html_session() Keep-Alive enable') + tm = timeit.timeit(f"_ = scraper1.get('{url}')", + "from __main__ import get_html_by_scraper;scraper1=get_html_by_scraper()", + number=t) + print(f' *{tm:>10.5f}s get_html_by_scraper() Keep-Alive enable') tm = timeit.timeit(f"_ = browser1.open('{url}')", "from __main__ import get_html_by_browser;browser1=get_html_by_browser()", number=t) - print(f'===={tm:2.5f}s get_html_by_browser() Keep-Alive enable====') + print(f' *{tm:>10.5f}s get_html_by_browser() Keep-Alive enable') tm = timeit.timeit(f"_ = get_html('{url}')", "from __main__ import get_html", number=t) - print(f'===={tm:2.5f}s get_html() ====') + print(f' *{tm:>10.5f}s get_html()') t = 100 #url = "https://www.189.cn/" url = "http://www.chinaunicom.com" HTTPConnection.debuglevel = 1 + s = get_html_session() _ = s.get(url) HTTPConnection.debuglevel = 0 - # Usage: python ./ADC_function.py https://cn.bing.com/ if len(sys.argv)>1: url = sys.argv[1] benchmark(t, url) diff --git a/Makefile b/Makefile index 3af212f..fdc4e2a 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,7 @@ make: @echo "[+]Pyinstaller make" pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \ + --add-data "`python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1`:cloudscraper" \ --add-data "`python3 -c 'import opencc as _; print(_.__path__[0])' | tail -n 1`:opencc" \ --add-data "Img:Img" \ --add-data "config.ini:." \ diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index c2ab57b..9adb7f9 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -30,7 +30,7 @@ def getActor(html): return r def getaphoto(url, session): - html_page = session.get(url).text if isinstance(session, requests.Session) else get_html(url) + html_page = session.get(url).text if session is not None else get_html(url) img_prether = re.compile(r'[{page_number}] not match") + except Exception as e: + if debug: + print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].") + pass + return '' + + +def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 try: url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), @@ -343,6 +366,8 @@ def amazon_select_one(a_titles, q_title, number, debug): if pos < 0: if category(char) == 'Nd': return -1 + if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U): + return -1 ama_t = ama_t[:cloc] findlen = 0 lastpos = -1