Add crawler named gcolle.py #2
This commit is contained in:
@@ -70,52 +70,6 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None,
|
|||||||
print('[-]Connect Failed! Please check your Proxy or Network!')
|
print('[-]Connect Failed! Please check your Proxy or Network!')
|
||||||
raise Exception('Connect Failed')
|
raise Exception('Connect Failed')
|
||||||
|
|
||||||
def get_html_requests_html(session, url, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None):
    """Session-aware HTTP GET helper (core request function supporting sessions).

    Fetches *url* through an existing ``HTMLSession`` so that cookies and
    other session state persist across calls (e.g. pass an age-check page
    first, then fetch the real product page with the same session).

    Usage:
        from requests_html import HTMLSession
        session = HTMLSession()  # new session
        get_html_requests_html(session, "https://xxx.com/login")
        r = get_html_requests_html(session, "https://xxx.com/xxx")
        print(r)

    :param session: a requests-html ``HTMLSession`` (any requests-like session works).
    :param url: target URL; coerced with ``str()`` before the request.
    :param cookies: optional cookies forwarded to ``session.get``.
    :param ua: optional User-Agent override; falls back to ``G_USER_AGENT``.
    :param return_type: ``"object"`` returns the Response, ``"content"`` returns
        raw bytes, anything else returns decoded text.
    :param encoding: optional text encoding; defaults to ``apparent_encoding``.
    :raises Exception: after all configured retries fail.
    """
    verify = config.getInstance().cacert_file()
    configProxy = config.getInstance().proxy()
    errors = ""

    headers = {"User-Agent": ua or G_USER_AGENT}  # noqa

    for i in range(configProxy.retry):
        try:
            if configProxy.enable:
                proxies = configProxy.proxies()
                result = session.get(str(url), headers=headers, timeout=configProxy.timeout, proxies=proxies,
                                     verify=verify,
                                     cookies=cookies)
            else:
                result = session.get(str(url), headers=headers, timeout=configProxy.timeout, cookies=cookies)

            if return_type == "object":
                return result
            elif return_type == "content":
                return result.content
            else:
                # Let requests guess the charset unless the caller pinned one.
                result.encoding = encoding or result.apparent_encoding
                return result.text
        except Exception as e:
            print("[-]Connect retry {}/{}".format(i + 1, configProxy.retry))
            errors = str(e)
    # All retries exhausted: report the last error, with a proxy hint for
    # DNS-resolution failures, then fail loudly.
    if "getaddrinfo failed" in errors:
        print("[-]Connect Failed! Please Check your proxy config")
        debug = config.getInstance().debug()
        if debug:
            print("[-]" + errors)
    else:
        print("[-]" + errors)
    print('[-]Connect Failed! Please check your Proxy or Network!')
    raise Exception('Connect Failed')
|
|
||||||
|
|
||||||
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
|
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
|
||||||
configProxy = config.getInstance().proxy()
|
configProxy = config.getInstance().proxy()
|
||||||
errors = ""
|
errors = ""
|
||||||
|
|||||||
@@ -8,14 +8,15 @@ from requests_html import HTMLSession
|
|||||||
|
|
||||||
def main(number):
|
def main(number):
|
||||||
config_file = config.getInstance()
|
config_file = config.getInstance()
|
||||||
browser = HTMLSession()
|
|
||||||
|
|
||||||
number = number.upper().replace('GCOLLE-','')
|
number = number.upper().replace('GCOLLE-','')
|
||||||
|
session = get_html_session()
|
||||||
|
|
||||||
htmlcode = get_html_requests_html(browser,'https://gcolle.net/product_info.php/products_id/' + number)
|
htmlcode = get_html_session('https://gcolle.net/product_info.php/products_id/' + number)
|
||||||
|
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
|
||||||
html = etree.HTML(htmlcode)
|
html = etree.HTML(htmlcode)
|
||||||
# R18 countinue
|
# R18 countinue
|
||||||
htmlcode = get_html_requests_html(browser,html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0])
|
htmlcode = session.get(html.xpath('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')[0]).text
|
||||||
gcolle_crawler = Crawler(htmlcode)
|
gcolle_crawler = Crawler(htmlcode)
|
||||||
|
|
||||||
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
|
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
|
||||||
|
|||||||
Reference in New Issue
Block a user