fix storyline

Mathhew
2022-07-28 23:07:51 +08:00
parent 669b11b313
commit 6de2e8f60f
9 changed files with 61 additions and 136 deletions
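
In short: the proxies and verify settings held by each Parser instance are now forwarded into the storyline scrapers, the browser-emulation helper get_html_by_browser is removed, and the airav.wiki and xcity lookups are rerouted through the existing Airav and Xcity parsers. A minimal sketch of the resulting call convention (the number, proxy mapping, and CA-bundle path below are placeholder values):

    from scrapinglib.storyline import getStoryline

    # Placeholder values: proxies is a requests-style mapping and
    # verify a CA-bundle path; both default to None.
    outline = getStoryline('ABC-123', uncensored=False,
                           proxies={'http': 'http://127.0.0.1:7890',
                                    'https': 'http://127.0.0.1:7890'},
                           verify='/etc/ssl/certs/ca-certificates.crt')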

View File

@@ -63,7 +63,7 @@ class Avsox(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number)
+            return getStoryline(self.number, proxies=self.proxies, verify=self.verify)
         return ''
 
     def getActors(self, htmltree):

View File

@@ -92,7 +92,8 @@ class Carib(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            result = getStoryline(self.number, uncensored=self.uncensored)
+            result = getStoryline(self.number, uncensored=self.uncensored,
+                                  proxies=self.proxies, verify=self.verify)
             if len(result):
                 return result
         return super().getOutline(htmltree)

View File

@@ -44,7 +44,7 @@ def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: s
     raise Exception('Connect Failed')
 
-def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
+def post(url: str, data: dict=None, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
          retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """
     Whether to use a proxy is decided by the caller
@@ -109,46 +109,6 @@ def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEF
     return session
 
-# storyline only
-# uses cloudscraper....
-def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
-                        encoding: str = None, use_scraper: bool = False,
-                        retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
-    session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
-    if isinstance(cookies, dict) and len(cookies):
-        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=retry, connect=retry, backoff_factor=1,
-                    status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
-    if verify:
-        session.verify = verify
-    if proxies:
-        session.proxies = proxies
-    try:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
-        if isinstance(url, str) and len(url):
-            result = browser.open(url)
-        else:
-            return browser
-        if not result.ok:
-            return None
-        if return_type == "object":
-            return result
-        elif return_type == "content":
-            return result.content
-        elif return_type == "browser":
-            return result, browser
-        else:
-            result.encoding = encoding or "utf-8"
-            return result.text
-    except requests.exceptions.ProxyError:
-        print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
-    except Exception as e:
-        print(f'[-]get_html_by_browser() Failed! {e}')
-    return None
-
 # storyline xcity only
 def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
                      return_type: str = None, encoding: str = None,
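
Callers that previously went through the removed get_html_by_browser can get the same retry, proxy, and TLS behavior from the remaining request_session helper. A minimal sketch, with placeholder URL and proxy values:

    from scrapinglib.httprequest import request_session

    # Placeholder proxy mapping; verify may be a CA-bundle path or None.
    session = request_session(proxies={'http': 'http://127.0.0.1:7890',
                                       'https': 'http://127.0.0.1:7890'},
                              verify=None)
    res = session.get('https://airav.cc/searchresults.aspx?Search=ABC-123')
    text = res.text if res.ok else ''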

View File

@@ -136,5 +136,6 @@ class Javbus(Parser):
         if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
             return ''  # calls coming from airav.py skip the outline and return directly, to avoid duplicate scraping slowing down processing
         from .storyline import getStoryline
-        return getStoryline(self.number , uncensored = self.uncensored)
+        return getStoryline(self.number , uncensored = self.uncensored,
+                            proxies=self.proxies, verify=self.verify)
         return ''
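
The inspect.stack() test above is a general caller-detection idiom; distilled into a standalone helper (hypothetical name, not part of this codebase), it reads:

    import inspect
    import os

    def called_from(filename: str) -> bool:
        # True if any frame on the current call stack originates from the given file
        return any(os.path.basename(frame.filename) == filename
                   for frame in inspect.stack())

    # e.g. called_from('airav.py') inside Javbus.getOutline()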

View File

@@ -176,7 +176,8 @@ class Javdb(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number, self.getUncensored(htmltree))
+            return getStoryline(self.number, self.getUncensored(htmltree),
+                                proxies=self.proxies, verify=self.verify)
         return ''
 
     def getTrailer(self, htmltree):

View File

@@ -76,5 +76,6 @@ class Javlibrary(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number, self.getUncensored(htmltree))
+            return getStoryline(self.number, self.getUncensored(htmltree),
+                                proxies=self.proxies, verify=self.verify)
         return ''

View File

@@ -88,6 +88,8 @@ class Parser:
        Parameters that need to be passed through: cookies, proxy, etc.
        Subclasses override this as needed
        """
+        if not core:
+            return
         if core.proxies:
             self.proxies = core.proxies
         if core.verify:
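
The new guard turns the update into a no-op when no configuration object is supplied. A minimal sketch of the surrounding method; the method name updateCore and the core attributes beyond those shown in the hunk are assumptions:

    class Parser:
        proxies = None
        verify = None

        def updateCore(self, core):  # hypothetical method name
            if not core:
                return  # nothing to copy from
            if core.proxies:
                self.proxies = core.proxies
            if core.verify:
                self.verify = core.verify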

View File

@@ -5,6 +5,7 @@
 """
+import json
 import os
 import re
 import time
@@ -13,7 +14,10 @@ import builtins
 from urllib.parse import urljoin
 from lxml.html import fromstring
 from multiprocessing.dummy import Pool as ThreadPool
-from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session
+from scrapinglib.airav import Airav
+from scrapinglib.xcity import Xcity
+from .httprequest import get_html_by_form, get_html_by_scraper, request_session
 
 # Amazon source dropped
 G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
@@ -35,7 +39,7 @@ class noThread(object):
 # Fetch the storyline: query the listed sites concurrently, earlier entries take priority
-def getStoryline(number, title = None, sites: list=None, uncensored=None):
+def getStoryline(number, title=None, sites: list=None, uncensored=None, proxies=None, verify=None):
     start_time = time.time()
     debug = False
     storyine_sites = "1:avno1,4:airavwiki".split(',')
@@ -52,7 +56,7 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None):
             r_dup.add(ns)
     sort_sites.sort()
     apply_sites = [re.sub(r'.*?:', '', s, re.A) for s in sort_sites]
-    mp_args = ((site, number, title, debug) for site in apply_sites)
+    mp_args = ((site, number, title, debug, proxies, verify) for site in apply_sites)
     cores = min(len(apply_sites), os.cpu_count())
     if cores == 0:
         return ''
@@ -79,24 +83,21 @@
 def getStoryline_mp(args):
-    (site, number, title, debug) = args
+    (site, number, title, debug, proxies, verify) = args
     start_time = time.time()
     storyline = None
     if not isinstance(site, str):
         return storyline
     elif site == "airavwiki":
-        storyline = getStoryline_airavwiki(number, debug)
-        #storyline = getStoryline_airavwiki_super(number, debug)
+        storyline = getStoryline_airavwiki(number, debug, proxies, verify)
     elif site == "airav":
-        storyline = getStoryline_airav(number, debug)
+        storyline = getStoryline_airav(number, debug, proxies, verify)
     elif site == "avno1":
-        storyline = getStoryline_avno1(number, debug)
+        storyline = getStoryline_avno1(number, debug, proxies, verify)
     elif site == "xcity":
-        storyline = getStoryline_xcity(number, debug)
-    # elif site == "amazon":
-    #     storyline = getStoryline_amazon(title, number, debug)
+        storyline = getStoryline_xcity(number, debug, proxies, verify)
     elif site == "58avgo":
-        storyline = getStoryline_58avgo(number, debug)
+        storyline = getStoryline_58avgo(number, debug, proxies, verify)
     if not debug:
         return storyline
     print("[!]MP thread [{}] ran {:.3f}s, finished at {}, returned: {}".format(
@@ -108,11 +109,11 @@ def getStoryline_mp(args):
     return storyline
 
-def getStoryline_airav(number, debug):
+def getStoryline_airav(number, debug, proxies, verify):
     try:
         site = secrets.choice(('airav.cc','airav4.club'))
         url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
-        session = request_session()
+        session = request_session(proxies=proxies, verify=verify)
         res = session.get(url)
         if not res:
             raise ValueError(f"get_html_by_session('{url}') failed")
@@ -143,36 +144,16 @@ def getStoryline_airav(number, debug):
     return None
 
-def getStoryline_airavwiki(number, debug):
+def getStoryline_airavwiki(number, debug, proxies, verify):
     try:
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
-        url = f'https://cn.airav.wiki/?search={kwd}'
-        result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
-        if not result.ok:
-            raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
-        s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
-        link = None
-        for a in s:
-            title = a.img['title']
-            list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
-            if kwd == number:  # numbers like PRED-164 and RED-164 must remain distinguishable
-                if re.match(f'^{number}$', list_number, re.I):
-                    link = a
-                    break
-            elif re.search(number, list_number, re.I):
-                link = a
-                break
-        if link is None:
-            raise ValueError("number not found")
-        result = browser.follow_link(link)
-        if not result.ok or not re.search(number, browser.url, re.I):
-            raise ValueError("detail page not found")
-        title = browser.page.select('head > title')[0].text.strip()
-        detail_number = str(re.findall('\[(.*?)]', title)[0])
-        if not re.search(number, detail_number, re.I):
-            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
-        desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
-        return desc
+        airavwiki = Airav()
+        airavwiki.addtion_Javbus = False
+        airavwiki.proxies = proxies
+        airavwiki.verify = verify
+        jsons = airavwiki.search(kwd)
+        outline = json.loads(jsons).get('outline')
+        return outline
     except Exception as e:
         if debug:
             print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
@@ -180,7 +161,7 @@ def getStoryline_airavwiki(number, debug):
     return ''
 
-def getStoryline_58avgo(number, debug):
+def getStoryline_58avgo(number, debug, proxies, verify):
     try:
         url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
             '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
@@ -189,6 +170,7 @@ def getStoryline_58avgo(number, debug):
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
         result, browser = get_html_by_form(url,
                           fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+                          proxies=proxies, verify=verify,
                           return_type = 'browser')
         if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -219,13 +201,13 @@ def getStoryline_58avgo(number, debug):
     return ''
 
-def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
+def getStoryline_avno1(number, debug, proxies, verify):  # fetch the storyline from avno1.cc
     try:
         site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
                                'hotav.biz','iqq2.xyz','javhq.tv',
                                'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
         url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
-        lx = fromstring(get_html_by_scraper(url))
+        lx = fromstring(get_html_by_scraper(url, proxies=proxies, verify=verify))
         descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
         titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
         if not descs or not len(descs):
@@ -246,7 +228,7 @@ def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
     return ''
 
-def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc
+def getStoryline_avno1OLD(number, debug, proxies, verify):  # fetch the storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
@@ -255,6 +237,7 @@ def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc
         result, browser = get_html_by_form(url,
                           form_select='div.wrapper > div.header > div.search > form',
                           fields = {'kw' : number},
+                          proxies=proxies, verify=verify,
                           return_type = 'browser')
         if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -272,19 +255,14 @@ def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc
     return ''
 
-def getStoryline_xcity(number, debug):  # fetch the storyline from xcity
+def getStoryline_xcity(number, debug, proxies, verify):  # fetch the storyline from xcity
     try:
-        xcity_number = number.replace('-','')
-        query_result, browser = get_html_by_form(
-            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
-            fields = {'q' : xcity_number.lower()},
-            return_type = 'browser')
-        if not query_result or not query_result.ok:
-            raise ValueError("page not found")
-        result = browser.follow_link(browser.links('avod\/detail')[0])
-        if not result.ok:
-            raise ValueError("detail page not found")
-        return browser.page.select_one('h2.title-detail + p.lead').text.strip()
+        xcityEngine = Xcity()
+        xcityEngine.proxies = proxies
+        xcityEngine.verify = verify
+        jsons = xcityEngine.search(number)
+        outline = json.loads(jsons).get('outline')
+        return outline
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")

View File

@@ -3,7 +3,6 @@
 import re
 import secrets
 from urllib.parse import urljoin
-from lxml import etree
 from .httprequest import get_html_by_form
 from .parser import Parser
@@ -27,6 +26,19 @@ class Xcity(Parser):
     expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
     expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
     expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'
+    expr_outline = '//head/meta[@property="og:description"]/@content'
+
+    def queryNumberUrl(self, number):
+        xcity_number = number.replace('-','')
+        query_result, browser = get_html_by_form(
+            'https://xcity.jp/' + secrets.choice(['sitemap/','policy/','law/','help/','main/']),
+            fields = {'q' : xcity_number.lower()},
+            cookies=self.cookies, proxies=self.proxies, verify=self.verify,
+            return_type = 'browser')
+        if not query_result or not query_result.ok:
+            raise ValueError("xcity.py: page not found")
+        prelink = browser.links('avod\/detail')[0]['href']
+        return urljoin('https://xcity.jp', prelink)
 
     def getStudio(self, htmltree):
         return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
@@ -55,12 +67,6 @@ class Xcity(Parser):
         except:
             return ''
 
-    def getOutline(self, htmltree):
-        if self.morestoryline:
-            from .storyline import getStoryline
-            return getStoryline(self.number, uncensored=False)
-        return ''
-
     def getActorPhoto(self, htmltree):
         treea = self.getTreeAll(htmltree, self.expr_actor_link)
         t = {i.text.strip(): i.attrib['href'] for i in treea}
@@ -84,28 +90,3 @@ class Xcity(Parser):
             i = "https:" + i
             extrafanart.append(i)
         return extrafanart
-
-    def open_by_browser(self, number):
-        xcity_number = number.replace('-','')
-        query_result, browser = get_html_by_form(
-            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
-            fields = {'q' : xcity_number.lower()},
-            return_type = 'browser')
-        if not query_result or not query_result.ok:
-            raise ValueError("xcity.py: page not found")
-        result = browser.follow_link(browser.links('avod\/detail')[0])
-        if not result.ok:
-            raise ValueError("xcity.py: detail page not found")
-        return str(browser.page), browser
-
-    def search(self, number):
-        self.number = number
-        if self.specifiedUrl:
-            self.detailurl = self.specifiedUrl
-            lx = self.getHtmlTree(self.detailurl)
-        else:
-            self.detail_page, self.browser = self.open_by_browser(number)
-            self.detailurl = self.browser.url
-            lx = etree.fromstring(self.detail_page, etree.HTMLParser())
-        result = self.dictformat(lx)
-        return result
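
With queryNumberUrl and expr_outline in place, Xcity resolves the detail page through the generic Parser search flow, which is what the new getStoryline_xcity relies on. A minimal usage sketch, assuming search returns a JSON string containing an outline field (as the storyline.py hunk above expects):

    import json
    from scrapinglib.xcity import Xcity

    engine = Xcity()
    engine.proxies = None  # or a requests-style proxy mapping
    engine.verify = None   # or a CA-bundle path
    outline = json.loads(engine.search('ABC-123')).get('outline')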