update scrapinglib
- support specifiedUrl when scraping a single movie
- support javlibrary and rating
@@ -9,8 +9,9 @@ from cloudscraper import create_scraper
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
 G_DEFAULT_TIMEOUT = 10
 
-def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
-        retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None,
+        retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Core web request function
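A minimal usage sketch of the updated get() wrapper. The module path scrapinglib.httprequest and the return_type semantics (Response object vs. decoded text) are assumptions inferred from the signature and from the return_type handling visible in the removed get_html_session() below; the URL is a placeholder.

# Sketch only: module path and return_type behaviour are assumptions,
# the URL is a placeholder.
from scrapinglib import httprequest

text = httprequest.get('https://example.com/movie/ABC-123', retry=3, timeout=10)
resp = httprequest.get('https://example.com/movie/ABC-123', return_type='object')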
@@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     raise Exception('Connect Failed')
 
 
-def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
-         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
+         retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Whether to use a proxy should be decided by the caller
     """
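Per the docstring, get()/post() do not decide proxy usage themselves; the caller passes a ready-made requests-style proxies mapping straight through. A sketch, with placeholder proxy address and form fields:

# Proxy selection happens at the call site, per the docstring above.
# Proxy address, URL and form data are placeholders; the module path
# is an assumption.
from scrapinglib import httprequest

proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}
result = httprequest.post('https://example.com/search',
                          data={'keyword': 'ABC-123'},
                          proxies=proxies, retry=3, timeout=10)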
@@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_
     raise Exception('Connect Failed')
 
 
-#
-# TODO: temporary code below; revisit after each site scraper has been updated
-#
-
-
 class TimeoutHTTPAdapter(HTTPAdapter):
     def __init__(self, *args, **kwargs):
         self.timeout = G_DEFAULT_TIMEOUT
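Only the self.timeout assignment and the super().send() call of TimeoutHTTPAdapter are visible in this diff. The conventional shape of such an adapter, offered as a sketch rather than the file's verbatim code, is to accept an optional timeout in __init__ and inject it in send() whenever the caller did not set one:

# Sketch of the timeout-defaulting adapter pattern; the lines between
# the visible fragments are assumptions based on the standard idiom.
from requests.adapters import HTTPAdapter

G_DEFAULT_TIMEOUT = 10

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        # Fall back to the module default, but allow a timeout= override.
        self.timeout = G_DEFAULT_TIMEOUT
        if "timeout" in kwargs:
            self.timeout = kwargs.pop("timeout")
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Inject the stored timeout only when the caller set none.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)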
@@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter):
         return super().send(request, **kwargs)
 
 
-# with keep-alive feature
-# storyline carib gcolle javdb only
-def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
-                     encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     keep-alive
     """
     session = requests.Session()
     retries = Retry(total=retry, connect=retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
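The Retry object only takes effect once it is mounted on the session through an adapter; those lines are not shown in this hunk. A self-contained sketch of the standard glue, assuming the TimeoutHTTPAdapter sketched above:

# Assumed mounting pattern; the diff does not show these lines.
import requests
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, connect=3, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
adapter = TimeoutHTTPAdapter(max_retries=retries, timeout=10)  # class sketched above
session.mount('http://', adapter)
session.mount('https://', adapter)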
@@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ
     if proxies:
         session.proxies = proxies
     session.headers = {"User-Agent": ua or G_USER_AGENT}
-    try:
-        if isinstance(url, str) and len(url):
-            result = session.get(str(url))
-        else:  # an empty url returns the reusable session object directly; no need to set return_type
-            return session
-        if not result.ok:
-            return None
-        if return_type == "object":
-            return result
-        elif return_type == "content":
-            return result.content
-        elif return_type == "session":
-            return result, session
-        else:
-            result.encoding = encoding or "utf-8"
-            return result.text
-    except requests.exceptions.ProxyError:
-        print("[-]get_html_session() Proxy error! Please check your Proxy")
-    except Exception as e:
-        print(f"[-]get_html_session() failed. {e}")
-    return None
+    return session
 
 
 # storyline only
 # use cloudscraper....
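With the URL-fetching branch removed, request_session() now simply hands back a configured keep-alive session for the caller to drive. A usage sketch with placeholder URLs (the import path is an assumption); for the Cloudflare-fronted storyline sources, the create_scraper import shown in the first hunk header can be used the same way:

# Reuse one keep-alive session across several requests.
from scrapinglib.httprequest import request_session  # assumed module path

session = request_session(retry=3, timeout=10)
page1 = session.get('https://example.com/movie/1')
page2 = session.get('https://example.com/movie/2')  # reuses the connection

# Cloudflare-protected storyline pages: cloudscraper exposes a
# requests-compatible session via create_scraper().
from cloudscraper import create_scraper
scraper = create_scraper()
page3 = scraper.get('https://example.com/protected')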