update scrapinglib

This commit is contained in:
Mathhew
2022-05-27 15:24:29 +08:00
parent d6d0a1687b
commit 9898f2918f
16 changed files with 213 additions and 73 deletions

View File

@@ -11,10 +11,13 @@ class Parser:
# Parser defaults.
# NOTE(review): the exact meaning of imagecut / uncensored /
# allow_number_change is not visible in this section - confirm against the
# code that reads them.
source = 'base'
imagecut = 1
uncensored = False
allow_number_change = False
# Request settings: getHtml() forwards these to httprequest.get() as
# proxies=, cookies=, verify= and extra_headers= respectively.
proxies = None
cookies = None
verify = None
extraheader = None
# Switched to True by the core-settings update when core.morestoryline is truthy.
morestoryline = False
# State of the current lookup; search() assigns both values.
number = ''
detailurl = ''
@@ -47,12 +50,15 @@ class Parser:
def __init__(self) -> None:
# Intentionally a no-op: no per-instance setup is performed here.
pass
def search(self, number, core: None):
""" 搜索番号
def scrape(self, number, core: None):
""" Scrape the given number.

Stores `number` on the instance, applies the settings carried by `core`
through updateCore(), then delegates to search() and returns its result
unchanged.

NOTE(review): `core` is annotated as None although it is handed to
updateCore(); the annotation presumably should name the settings/core
type - confirm.
"""
# NOTE(review): search() assigns self.number as well, so this assignment
# may be redundant - confirm before removing.
self.number = number
self.updateCore(core)
# search() performs the actual lookup for this number.
result = self.search(number)
return result
def search(self, number):
self.number = number
self.detailurl = self.queryNumberUrl(number)
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
@@ -66,6 +72,10 @@ class Parser:
"""
if core.proxies:
self.proxies = core.proxies
if core.verify:
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
def queryNumberUrl(self, number):
""" 根据番号查询详细信息url
@@ -78,7 +88,7 @@ class Parser:
def getHtml(self, url, type = None):
""" 访问网页
"""
resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, verify=self.verify, return_type=type)
resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type)
if '<title>404 Page Not Found' in resp \
or '<title>未找到页面' in resp \
or '404 Not Found' in resp \