diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py
index 9cb5213..9c324a6 100644
--- a/scrapinglib/avsox.py
+++ b/scrapinglib/avsox.py
@@ -63,7 +63,7 @@ class Avsox(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number)
+            return getStoryline(self.number, proxies=self.proxies, verify=self.verify)
         return ''

     def getActors(self, htmltree):
diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py
index decaba6..cc04ae7 100644
--- a/scrapinglib/carib.py
+++ b/scrapinglib/carib.py
@@ -92,7 +92,8 @@ class Carib(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            result = getStoryline(self.number, uncensored=self.uncensored)
+            result = getStoryline(self.number, uncensored=self.uncensored,
+                                  proxies=self.proxies, verify=self.verify)
             if len(result):
                 return result
         return super().getOutline(htmltree)
diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py
index 7e99819..e987d63 100644
--- a/scrapinglib/httprequest.py
+++ b/scrapinglib/httprequest.py
@@ -44,7 +44,7 @@ def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: s
     raise Exception('Connect Failed')


-def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
+def post(url: str, data: dict=None, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
         retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
    """ Whether to use a proxy is up to the caller

@@ -109,46 +109,6 @@ def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEF
     return session


-# storyline only
-# uses cloudscraper....
-def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
-                        encoding: str = None, use_scraper: bool = False,
-                        retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
-    session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
-    if isinstance(cookies, dict) and len(cookies):
-        requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
-    retries = Retry(total=retry, connect=retry, backoff_factor=1,
-                    status_forcelist=[429, 500, 502, 503, 504])
-    session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
-    session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
-    if verify:
-        session.verify = verify
-    if proxies:
-        session.proxies = proxies
-    try:
-        browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
-        if isinstance(url, str) and len(url):
-            result = browser.open(url)
-        else:
-            return browser
-        if not result.ok:
-            return None
-
-        if return_type == "object":
-            return result
-        elif return_type == "content":
-            return result.content
-        elif return_type == "browser":
-            return result, browser
-        else:
-            result.encoding = encoding or "utf-8"
-            return result.text
-    except requests.exceptions.ProxyError:
-        print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
-    except Exception as e:
-        print(f'[-]get_html_by_browser() Failed! \n{e}')
-    return None
-
 # storyline xcity only
 def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None,
                      ua: str = None, return_type: str = None, encoding: str = None,
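Note: post() now defaults data to None, and the cloudscraper/mechanicalsoup helper get_html_by_browser() is removed; its remaining callers (in storyline.py, below) move to request_session() or to the site parser classes. A minimal sketch of the replacement call pattern, assuming only what the patch shows about request_session(); the proxy address and URL are placeholders, not part of the patch:

    # Build a retrying session that honors the caller's proxy settings,
    # then fetch a page the way the storyline fetchers now do.
    from scrapinglib.httprequest import request_session

    proxies = {'http': 'http://127.0.0.1:7890', 'https': 'http://127.0.0.1:7890'}  # placeholder
    session = request_session(proxies=proxies, verify=False)  # verify=False skips TLS verification
    res = session.get('https://example.com/')  # placeholder URL
    if res:  # requests.Response is truthy when the status is OK
        print(res.text[:100])
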
diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py
index 8e52de1..eb559c0 100644
--- a/scrapinglib/javbus.py
+++ b/scrapinglib/javbus.py
@@ -136,5 +136,6 @@ class Javbus(Parser):
         if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
             return ''   # calls from airav.py return immediately without computing the outline, avoiding duplicate scraping that slows processing
         from .storyline import getStoryline
-        return getStoryline(self.number , uncensored = self.uncensored)
+        return getStoryline(self.number , uncensored = self.uncensored,
+                            proxies=self.proxies, verify=self.verify)
         return ''
diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py
index 3cacd05..c21a819 100644
--- a/scrapinglib/javdb.py
+++ b/scrapinglib/javdb.py
@@ -176,7 +176,8 @@ class Javdb(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number, self.getUncensored(htmltree))
+            return getStoryline(self.number, self.getUncensored(htmltree),
+                                proxies=self.proxies, verify=self.verify)
         return ''

     def getTrailer(self, htmltree):
diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py
index 782fa22..b2c7d19 100644
--- a/scrapinglib/javlibrary.py
+++ b/scrapinglib/javlibrary.py
@@ -76,5 +76,6 @@ class Javlibrary(Parser):
     def getOutline(self, htmltree):
         if self.morestoryline:
             from .storyline import getStoryline
-            return getStoryline(self.number, self.getUncensored(htmltree))
+            return getStoryline(self.number, self.getUncensored(htmltree),
+                                proxies=self.proxies, verify=self.verify)
         return ''
diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py
index fa25d4e..90670ff 100644
--- a/scrapinglib/parser.py
+++ b/scrapinglib/parser.py
@@ -88,6 +88,8 @@ class Parser:
        Parameters that need passing through: cookies, proxy, etc.
        Subclasses override this as needed
        """
+        if not core:
+            return
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
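Note: the new guard makes Parser.updateCore(None) a no-op instead of an AttributeError. A sketch of the contract in isolation; ParserSketch is illustrative only, not the real class:

    # Shape of the guard added above: bail out before touching a missing core,
    # then copy the network options onto the parser when they are set.
    class ParserSketch:
        proxies = None
        verify = None

        def updateCore(self, core):
            if not core:        # new: tolerate core=None
                return
            if core.proxies:
                self.proxies = core.proxies
            if core.verify:
                self.verify = core.verify

    ParserSketch().updateCore(None)  # no longer raises
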
diff --git a/scrapinglib/storyline.py b/scrapinglib/storyline.py
index 2194c76..306789a 100644
--- a/scrapinglib/storyline.py
+++ b/scrapinglib/storyline.py
@@ -5,6 +5,7 @@

 """

+import json
 import os
 import re
 import time
@@ -13,7 +14,10 @@ import builtins
 from urllib.parse import urljoin
 from lxml.html import fromstring
 from multiprocessing.dummy import Pool as ThreadPool
-from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session
+
+from scrapinglib.airav import Airav
+from scrapinglib.xcity import Xcity
+from .httprequest import get_html_by_form, get_html_by_scraper, request_session

 # Amazon source dropped
 G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
@@ -35,7 +39,7 @@ class noThread(object):


 # Fetch the storyline: query the listed sites concurrently; earlier entries take priority
-def getStoryline(number, title = None, sites: list=None, uncensored=None):
+def getStoryline(number, title=None, sites: list=None, uncensored=None, proxies=None, verify=None):
     start_time = time.time()
     debug = False
     storyine_sites = "1:avno1,4:airavwiki".split(',')
@@ -52,7 +56,7 @@
             r_dup.add(ns)
     sort_sites.sort()
     apply_sites = [re.sub(r'.*?:', '', s, re.A) for s in sort_sites]
-    mp_args = ((site, number, title, debug) for site in apply_sites)
+    mp_args = ((site, number, title, debug, proxies, verify) for site in apply_sites)
     cores = min(len(apply_sites), os.cpu_count())
     if cores == 0:
         return ''
@@ -79,24 +83,21 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None):


 def getStoryline_mp(args):
-    (site, number, title, debug) = args
+    (site, number, title, debug, proxies, verify) = args
     start_time = time.time()
     storyline = None
     if not isinstance(site, str):
         return storyline
     elif site == "airavwiki":
-        storyline = getStoryline_airavwiki(number, debug)
-        #storyline = getStoryline_airavwiki_super(number, debug)
+        storyline = getStoryline_airavwiki(number, debug, proxies, verify)
     elif site == "airav":
-        storyline = getStoryline_airav(number, debug)
+        storyline = getStoryline_airav(number, debug, proxies, verify)
     elif site == "avno1":
-        storyline = getStoryline_avno1(number, debug)
+        storyline = getStoryline_avno1(number, debug, proxies, verify)
     elif site == "xcity":
-        storyline = getStoryline_xcity(number, debug)
-    # elif site == "amazon":
-    #     storyline = getStoryline_amazon(title, number, debug)
+        storyline = getStoryline_xcity(number, debug, proxies, verify)
     elif site == "58avgo":
-        storyline = getStoryline_58avgo(number, debug)
+        storyline = getStoryline_58avgo(number, debug, proxies, verify)
     if not debug:
         return storyline
     print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
@@ -108,11 +109,11 @@ def getStoryline_mp(args):
     return storyline


-def getStoryline_airav(number, debug):
+def getStoryline_airav(number, debug, proxies, verify):
     try:
         site = secrets.choice(('airav.cc','airav4.club'))
         url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
-        session = request_session()
+        session = request_session(proxies=proxies, verify=verify)
         res = session.get(url)
         if not res:
             raise ValueError(f"get_html_by_session('{url}') failed")
@@ -143,36 +144,16 @@
     return None


-def getStoryline_airavwiki(number, debug):
+def getStoryline_airavwiki(number, debug, proxies, verify):
     try:
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
-        url = f'https://cn.airav.wiki/?search={kwd}'
-        result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
-        if not result.ok:
-            raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
-        s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
-        link = None
-        for a in s:
-            title = a.img['title']
-            list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
-            if kwd == number:  # numbers like PRED-164 and RED-164 must be distinguished
-                if re.match(f'^{number}$', list_number, re.I):
-                    link = a
-                    break
-            elif re.search(number, list_number, re.I):
-                link = a
-                break
-        if link is None:
-            raise ValueError("number not found")
-        result = browser.follow_link(link)
-        if not result.ok or not re.search(number, browser.url, re.I):
-            raise ValueError("detail page not found")
-        title = browser.page.select('head > title')[0].text.strip()
-        detail_number = str(re.findall('\[(.*?)]', title)[0])
-        if not re.search(number, detail_number, re.I):
-            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
-        desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
-        return desc
+        airavwiki = Airav()
+        airavwiki.addtion_Javbus = False
+        airavwiki.proxies = proxies
+        airavwiki.verify = verify
+        jsons = airavwiki.search(kwd)
+        outline = json.loads(jsons).get('outline')
+        return outline
     except Exception as e:
         if debug:
             print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
@@ -180,7 +161,7 @@
     return ''


-def getStoryline_58avgo(number, debug):
+def getStoryline_58avgo(number, debug, proxies, verify):
     try:
         url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
             '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
@@ -189,6 +170,7 @@
         kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
         result, browser = get_html_by_form(url,
                             fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+                            proxies=proxies, verify=verify,
                             return_type = 'browser')
         if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -219,13 +201,13 @@
     return ''


-def getStoryline_avno1(number, debug): # fetch the storyline from avno1.cc
+def getStoryline_avno1(number, debug, proxies, verify): # fetch the storyline from avno1.cc
     try:
         site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
             'hotav.biz','iqq2.xyz','javhq.tv',
             'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
         url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
-        lx = fromstring(get_html_by_scraper(url))
+        lx = fromstring(get_html_by_scraper(url, proxies=proxies, verify=verify))
         descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
         titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
         if not descs or not len(descs):
@@ -246,7 +228,7 @@ def getStoryline_avno1(number, debug): # fetch the storyline from avno1.cc
     return ''


-def getStoryline_avno1OLD(number, debug): # fetch the storyline from avno1.cc
+def getStoryline_avno1OLD(number, debug, proxies, verify): # fetch the storyline from avno1.cc
     try:
         url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
             secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
@@ -255,6 +237,7 @@
         result, browser = get_html_by_form(url,
                             form_select='div.wrapper > div.header > div.search > form',
                             fields = {'kw' : number},
+                            proxies=proxies, verify=verify,
                             return_type = 'browser')
         if not result:
             raise ValueError(f"get_html_by_form('{url}','{number}') failed")
@@ -272,19 +255,14 @@
     return ''


-def getStoryline_xcity(number, debug): # fetch the storyline from xcity
+def getStoryline_xcity(number, debug, proxies, verify): # fetch the storyline from xcity
     try:
-        xcity_number = number.replace('-','')
-        query_result, browser = get_html_by_form(
-            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
-            fields = {'q' : xcity_number.lower()},
-            return_type = 'browser')
-        if not query_result or not query_result.ok:
-            raise ValueError("page not found")
-        result = browser.follow_link(browser.links('avod\/detail')[0])
-        if not result.ok:
-            raise ValueError("detail page not found")
-        return browser.page.select_one('h2.title-detail + p.lead').text.strip()
+        xcityEngine = Xcity()
+        xcityEngine.proxies = proxies
+        xcityEngine.verify = verify
+        jsons = xcityEngine.search(number)
+        outline = json.loads(jsons).get('outline')
+        return outline
     except Exception as e:
         if debug:
             print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
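Note: proxies and verify now ride along in mp_args, so every per-site fetcher spawned by the ThreadPool sees the same network options as the calling parser. A usage sketch, assuming only the new signature; the number and proxy dict are placeholders:

    # The two new keyword arguments fan out to all registered storyline sites.
    from scrapinglib.storyline import getStoryline

    outline = getStoryline('ABC-123',  # placeholder number
                           proxies={'https': 'http://127.0.0.1:7890'},  # placeholder proxy
                           verify=False)
    print(outline or '(no storyline found)')
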
diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py
index 36230bb..05beb39 100644
--- a/scrapinglib/xcity.py
+++ b/scrapinglib/xcity.py
@@ -3,7 +3,6 @@

 import re
 import secrets
 from urllib.parse import urljoin
-from lxml import etree
 from .httprequest import get_html_by_form
 from .parser import Parser
@@ -27,6 +26,19 @@
     expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
     expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
     expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'
'//div[@id="sample_images"]/div/a/@href' + expr_outline = '//head/meta[@property="og:description"]/@content' + + def queryNumberUrl(self, number): + xcity_number = number.replace('-','') + query_result, browser = get_html_by_form( + 'https://xcity.jp/' + secrets.choice(['sitemap/','policy/','law/','help/','main/']), + fields = {'q' : xcity_number.lower()}, + cookies=self.cookies, proxies=self.proxies, verify=self.verify, + return_type = 'browser') + if not query_result or not query_result.ok: + raise ValueError("xcity.py: page not found") + prelink = browser.links('avod\/detail')[0]['href'] + return urljoin('https://xcity.jp', prelink) def getStudio(self, htmltree): return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') @@ -55,12 +67,6 @@ class Xcity(Parser): except: return '' - def getOutline(self, htmltree): - if self.morestoryline: - from .storyline import getStoryline - return getStoryline(self.number, uncensored=False) - return '' - def getActorPhoto(self, htmltree): treea = self.getTreeAll(htmltree, self.expr_actor_link) t = {i.text.strip(): i.attrib['href'] for i in treea} @@ -84,28 +90,3 @@ class Xcity(Parser): i = "https:" + i extrafanart.append(i) return extrafanart - - def open_by_browser(self, number): - xcity_number = number.replace('-','') - query_result, browser = get_html_by_form( - 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), - fields = {'q' : xcity_number.lower()}, - return_type = 'browser') - if not query_result or not query_result.ok: - raise ValueError("xcity.py: page not found") - result = browser.follow_link(browser.links('avod\/detail')[0]) - if not result.ok: - raise ValueError("xcity.py: detail page not found") - return str(browser.page), browser - - def search(self, number): - self.number = number - if self.specifiedUrl: - self.detailurl = self.specifiedUrl - lx = self.getHtmlTree(self.detailurl) - else: - self.detail_page, self.browser = self.open_by_browser(number) - self.detailurl = self.browser.url - lx = etree.fromstring(self.detail_page, etree.HTMLParser()) - result = self.dictformat(lx) - return result