Add crawler named gcolle.py #2
This commit is contained in:
@@ -70,52 +70,6 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
|
||||
print('[-]Connect Failed! Please check your Proxy or Network!')
|
||||
raise Exception('Connect Failed')
|
||||
|
||||
def get_html_requests_html(session, url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
    """
    Session-aware web-request helper (core fetch routine with retries).

    Usage:
        from requests_html import HTMLSession
        session = HTMLSession()  # new session
        get_html_requests_html(session, "https://xxx.com/login")
        r = get_html_requests_html(session, "https://xxx.com/xxx")
        print(r)
    """
    verify = config.getInstance().cacert_file()
    proxy_cfg = config.getInstance().proxy()
    last_error = ""

    headers = {"User-Agent": ua or G_USER_AGENT}  # noqa

    for attempt in range(proxy_cfg.retry):
        try:
            if proxy_cfg.enable:
                # Route through the configured proxy; verify uses the
                # configured CA-cert file only on the proxied path.
                resp = session.get(str(url), headers=headers,
                                   timeout=proxy_cfg.timeout,
                                   proxies=proxy_cfg.proxies(),
                                   verify=verify,
                                   cookies=cookies)
            else:
                resp = session.get(str(url), headers=headers,
                                   timeout=proxy_cfg.timeout,
                                   cookies=cookies)

            # Shape of the return value is selected by the caller.
            if return_type == "object":
                return resp
            if return_type == "content":
                return resp.content
            resp.encoding = encoding or resp.apparent_encoding
            return resp.text
        except Exception as e:
            print(f"[-]Connect retry {attempt + 1}/{proxy_cfg.retry}")
            last_error = str(e)

    # All retries exhausted: report the most useful hint, then raise.
    if "getaddrinfo failed" in last_error:
        print("[-]Connect Failed! Please Check your proxy config")
        if config.getInstance().debug():
            print("[-]" + last_error)
    else:
        print("[-]" + last_error)
    print('[-]Connect Failed! Please check your Proxy or Network!')
    raise Exception('Connect Failed')
|
||||
|
||||
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
|
||||
configProxy = config.getInstance().proxy()
|
||||
errors = ""
|
||||
|
||||
@@ -8,14 +8,15 @@ from requests_html import HTMLSession
|
||||
|
||||
def main(number):
|
||||
config_file = config.getInstance()
|
||||
browser = HTMLSession()
|
||||
|
||||
number = number.upper().replace('GCOLLE-','')
|
||||
session = get_html_session()
|
||||
|
||||
htmlcode = get_html_requests_html(browser,'https://gcolle.net/product_info.php/products_id/' + number)
|
||||
htmlcode = get_html_session('https://gcolle.net/product_info.php/products_id/' + number)
|
||||
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
|
||||
html = etree.HTML(htmlcode)
|
||||
# R18 continue
|
||||
htmlcode = get_html_requests_html(browser,html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0])
|
||||
htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
|
||||
gcolle_crawler = Crawler(htmlcode)
|
||||
|
||||
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
|
||||
|
||||
Reference in New Issue
Block a user