update scrapinglib

- support specifiedUrl when scraping a single movie
- support javlibrary and rating
Mathhew
2022-07-28 18:45:54 +08:00
parent ee1306fb3b
commit ce388edce8
23 changed files with 379 additions and 176 deletions
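
For orientation, a minimal sketch of what the new specifiedUrl support could look like from a caller's side. The `search` entry point, the `specifiedSource` parameter, and the URL are illustrative assumptions; this page only shows the HTTP-layer diff.

# Hypothetical caller-side sketch -- search() and its parameter names are
# assumptions for illustration, not shown in this diff.
from scrapinglib import search

# Scrape a single movie from an explicitly chosen page instead of letting
# the scraper pick a result from its own search.
data = search('ABC-123',
              specifiedSource='javlibrary',
              specifiedUrl='https://www.javlibrary.com/en/?v=example')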


@@ -9,8 +9,9 @@ from cloudscraper import create_scraper
 G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
 G_DEFAULT_TIMEOUT = 10
-def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
-        retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None,
+        retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Core web request function
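
A minimal usage sketch of the updated `get` helper; the import path is an assumption based on the directory name in this commit.

# Minimal sketch, assuming the module lives at scrapinglib/httprequest.py.
from scrapinglib import httprequest

# Decoded text with the defaults: 3 retries, 10-second timeout.
html = httprequest.get('https://www.javlibrary.com/en/')

# Raw bytes, e.g. for saving a cover image; return_type='content' is assumed
# to mirror the behavior of the session helper further down.
img = httprequest.get('https://example.com/cover.jpg', return_type='content')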
@@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     raise Exception('Connect Failed')
-def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
-         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
+         retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Whether to use a proxy should be decided by the caller
     """
@@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_
     raise Exception('Connect Failed')
-#
-# TODO: the code below is temporary; update it again once all the site scrapers have been migrated
-#
 class TimeoutHTTPAdapter(HTTPAdapter):
     def __init__(self, *args, **kwargs):
         self.timeout = G_DEFAULT_TIMEOUT
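
The hunks here elide most of the adapter body. For reference, a self-contained sketch of the standard requests timeout-adapter recipe this class follows; the elided method bodies below are reconstructed from that recipe, not taken from the diff.

# Sketch of the usual requests timeout-adapter pattern; the parts the hunks
# hide are assumptions based on the standard recipe.
from requests.adapters import HTTPAdapter

G_DEFAULT_TIMEOUT = 10

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        # Accept an optional default timeout, falling back to the module default.
        self.timeout = kwargs.pop("timeout", G_DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        # Apply the default unless the caller set a per-request timeout.
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)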
@@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter):
         return super().send(request, **kwargs)
 # with keep-alive feature
-# storyline carib gcolle javdb only
-def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
-                     encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     keep-alive
     """
     session = requests.Session()
     retries = Retry(total=retry, connect=retry, backoff_factor=1,
                     status_forcelist=[429, 500, 502, 503, 504])
@@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ
     if proxies:
         session.proxies = proxies
     session.headers = {"User-Agent": ua or G_USER_AGENT}
-    try:
-        if isinstance(url, str) and len(url):
-            result = session.get(str(url))
-        else:  # an empty url returns the reusable session object directly; no return_type needed
-            return session
-        if not result.ok:
-            return None
-        if return_type == "object":
-            return result
-        elif return_type == "content":
-            return result.content
-        elif return_type == "session":
-            return result, session
-        else:
-            result.encoding = encoding or "utf-8"
-            return result.text
-    except requests.exceptions.ProxyError:
-        print("[-]get_html_session() Proxy error! Please check your Proxy")
-    except Exception as e:
-        print(f"[-]get_html_session() failed. {e}")
-    return None
+    return session
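
With the URL handling removed, the function now only builds and returns a configured session; a usage sketch under the new contract (import path and URLs are placeholders):

# request_session() returns a plain reusable requests.Session; fetching and
# decoding are now the caller's job.
from scrapinglib.httprequest import request_session  # assumed path

session = request_session(retry=3, timeout=10)
first = session.get('https://example.com/page/1')    # the connection is
second = session.get('https://example.com/page/2')   # kept alive and reused
first.encoding = 'utf-8'
html = first.text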
 # storyline only
 # uses cloudscraper....