From 87972b033599300a3a873e9083866adb0ca6236a Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Tue, 19 Apr 2022 21:27:09 +0800
Subject: [PATCH] Add crawler named gcolle.py #2

---
 ADC_function.py      | 46 --------------------------------------------
 WebCrawler/gcolle.py |  7 ++++---
 2 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/ADC_function.py b/ADC_function.py
index 8272b89..c827a59 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -70,52 +70,6 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
     print('[-]Connect Failed! Please check your Proxy or Network!')
     raise Exception('Connect Failed')
 
-def get_html_requests_html(session, url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
-    """
-    支持会话的网页请求核心函数
-    Usage:
-    from requests_html import HTMLSession
-    session = HTMLSession() #New Session
-    get_html_requests_html(session,"https://xxx.com/login")
-    r = get_html_requests_html(session,"https://xxx.com/xxx")
-    print(r)
-    """
-    verify = config.getInstance().cacert_file()
-    configProxy = config.getInstance().proxy()
-    errors = ""
-
-    headers = {"User-Agent": ua or G_USER_AGENT}  # noqa
-
-    for i in range(configProxy.retry):
-        try:
-            if configProxy.enable:
-                proxies = configProxy.proxies()
-                result = session.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
-                                     verify=verify,
-                                     cookies=cookies)
-            else:
-                result = session.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)
-
-            if return_type == "object":
-                return result
-            elif return_type == "content":
-                return result.content
-            else:
-                result.encoding = encoding or result.apparent_encoding
-                return result.text
-        except Exception as e:
-            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
-            errors = str(e)
-    if "getaddrinfo failed" in errors:
-        print("[-]Connect Failed! Please Check your proxy config")
-        debug = config.getInstance().debug()
-        if debug:
-            print("[-]" + errors)
-    else:
-        print("[-]" + errors)
-    print('[-]Connect Failed! Please check your Proxy or Network!')
-    raise Exception('Connect Failed')
-
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
     configProxy = config.getInstance().proxy()
     errors = ""
diff --git a/WebCrawler/gcolle.py b/WebCrawler/gcolle.py
index b0d387a..f49a5b9 100644
--- a/WebCrawler/gcolle.py
+++ b/WebCrawler/gcolle.py
@@ -8,14 +8,15 @@ from requests_html import HTMLSession
 
 def main(number):
     config_file = config.getInstance()
-    browser = HTMLSession()
     number = number.upper().replace('GCOLLE-','')
+    session = get_html_session()
 
-    htmlcode = get_html_requests_html(browser,'https://gcolle.net/product_info.php/products_id/' + number)
+    htmlcode = get_html_session('https://gcolle.net/product_info.php/products_id/' + number)
+    htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
     html = etree.HTML(htmlcode)
 
     # R18 countinue
-    htmlcode = get_html_requests_html(browser,html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0])
+    htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
     gcolle_crawler = Crawler(htmlcode)
 
     number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')