From 4b83f39241a0e321f6768396e42cfcdae973f478 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 28 Jul 2022 17:51:45 +0800 Subject: [PATCH 1/6] fix user rating --- core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index 98c538d..dcfd173 100644 --- a/core.py +++ b/core.py @@ -404,8 +404,8 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f except: pass try: - f_rating = json_data['用户评分'] - uc = json_data['评分人数'] + f_rating = json_data.get('userrating') + uc = json_data.get('uservotes') print(f""" {round(f_rating * 2.0, 1)} {round(f_rating * 20.0, 1)} From 17d0c638dcb0f566a387b7d4de165c6329d466ff Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 28 Jul 2022 17:56:22 +0800 Subject: [PATCH 2/6] fix(carib): morestoryline --- scrapinglib/carib.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index af99d57..3d9f094 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -87,9 +87,10 @@ class Carib(Parser): return o def getOutline(self, htmltree): - from .storyline import getStoryline - result = getStoryline(self.number, uncensored=self.uncensored) - if len(result): - return result + if self.morestoryline: + from .storyline import getStoryline + result = getStoryline(self.number, uncensored=self.uncensored) + if len(result): + return result return super().getOutline(htmltree) From ee1306fb3bae3740841ff2346a962aca5a6fcbdd Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 28 Jul 2022 17:58:17 +0800 Subject: [PATCH 3/6] fix(madou): split tags --- scrapinglib/madou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index d15a2ee..f132c22 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -59,5 +59,5 @@ class Madou(Parser): def getTags(self, htmltree): studio = self.getStudio(htmltree) - x = super().getTags(htmltree).split(',') + x = super().getTags(htmltree) return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] From ce388edce8fead5c520d244988a9fbc8a967c5cd Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 28 Jul 2022 18:45:54 +0800 Subject: [PATCH 4/6] update scrapinglib - support specifiedUrl when scraping single movie - support javlibrary and rating --- scrapinglib/__init__.py | 2 +- scrapinglib/airav.py | 100 ++++++++++++++++++++++--------------- scrapinglib/api.py | 49 ++++++++++++++---- scrapinglib/avsox.py | 8 ++- scrapinglib/carib.py | 5 +- scrapinglib/dlsite.py | 7 ++- scrapinglib/fanza.py | 15 +++--- scrapinglib/fc2.py | 7 ++- scrapinglib/gcolle.py | 11 ++-- scrapinglib/getchu.py | 5 +- scrapinglib/httprequest.py | 45 ++++------------- scrapinglib/imdb.py | 24 +++++++++ scrapinglib/jav321.py | 8 +++ scrapinglib/javbus.py | 12 ++++- scrapinglib/javdb.py | 25 ++++++---- scrapinglib/javlibrary.py | 80 +++++++++++++++++++++++++++++ scrapinglib/madou.py | 6 ++- scrapinglib/mgstage.py | 5 +- scrapinglib/mv91.py | 6 ++- scrapinglib/parser.py | 57 ++++++++++++++++----- scrapinglib/storyline.py | 5 +- scrapinglib/tmdb.py | 13 ++--- scrapinglib/xcity.py | 60 +++++++++++----------- 23 files changed, 379 insertions(+), 176 deletions(-) create mode 100644 scrapinglib/imdb.py create mode 100644 scrapinglib/javlibrary.py diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py index 9f2e761..ee27a25 100644 --- a/scrapinglib/__init__.py +++ b/scrapinglib/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .api import search +from .api import search, getSupportedSources diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index c22384d..f0e2e39 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -8,6 +8,9 @@ from .javbus import Javbus class Airav(Parser): source = 'airav' + # for javbus + specifiedSource = None + addtion_Javbus = True expr_title = '/html/head/title/text()' expr_number = '/html/head/title/text()' @@ -21,23 +24,38 @@ class Airav(Parser): def search(self, number): self.number = number - self.detailurl = 'https://cn.airav.wiki/video/' + number - engine = Javbus() - javbusinfo = engine.scrape(number, self) - if javbusinfo == 404: - self.javbus = {"title": ""} + if self.specifiedUrl: + self.detailurl = self.specifiedUrl else: - self.javbus = json.loads(javbusinfo) + self.detailurl = self.queryNumberUrl(self.number) + if self.addtion_Javbus: + engine = Javbus() + javbusinfo = engine.scrape(self.number, self) + if javbusinfo == 404: + self.javbus = {"title": ""} + else: + self.javbus = json.loads(javbusinfo) self.htmlcode = self.getHtml(self.detailurl) htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) result = self.dictformat(htmltree) return result + def queryNumberUrl(self, number): + queryUrl = "https://cn.airav.wiki/?search=" + number + queryTree = self.getHtmlTree(queryUrl) + results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a') + for i in results: + num = self.getTreeElement(i, '//div/div[contains(@class,"videoNumber")]/p[1]/text()') + if num.replace('-','') == number.replace('-','').upper(): + self.number = num + return "https://cn.airav.wiki" + i.attrib['href'] + return 'https://cn.airav.wiki/video/' + number + def getNum(self, htmltree): - # return super().getNum(htmltree) - result = self.javbus.get('number') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('number') + if isinstance(result, str) and len(result): + return result number = super().getNum(htmltree) result = str(re.findall('^\[(.*?)]', number)[0]) return result @@ -48,24 +66,27 @@ class Airav(Parser): return result def getStudio(self, htmltree): - result = self.javbus.get('studio') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('studio') + if isinstance(result, str) and len(result): + return result return super().getStudio(htmltree) def getRelease(self, htmltree): - result = self.javbus.get('release') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('release') + if isinstance(result, str) and len(result): + return result try: return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group() except: return '' def getYear(self, htmltree): - result = self.javbus.get('year') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('year') + if isinstance(result, str) and len(result): + return result release = self.getRelease(htmltree) return str(re.findall('\d{4}', release)).strip(" ['']") @@ -73,39 +94,40 @@ class Airav(Parser): return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip() def getRuntime(self, htmltree): - result = self.javbus.get('runtime') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('runtime') + if isinstance(result, str) and len(result): + return result return '' def getDirector(self, htmltree): - result = self.javbus.get('director') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('director') + if isinstance(result, str) and len(result): + return result return '' def getActors(self, htmltree): - b=[] a = super().getActors(htmltree) - for v in a: - v = v.strip() - if len(v): - b.append(v) + b = [ i.strip() for i in a if len(i)] if len(b): return b - result = self.javbus.get('actor') - if isinstance(result, list) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('actor') + if isinstance(result, list) and len(result): + return result return [] def getCover(self, htmltree): - result = self.javbus.get('cover') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('cover') + if isinstance(result, str) and len(result): + return result return super().getCover(htmltree) def getSeries(self, htmltree): - result = self.javbus.get('series') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('series') + if isinstance(result, str) and len(result): + return result return '' diff --git a/scrapinglib/api.py b/scrapinglib/api.py index c8c4679..4314dee 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -18,29 +18,45 @@ from .mgstage import Mgstage from .javbus import Javbus from .xcity import Xcity from .avsox import Avsox +from .javlibrary import Javlibrary from .tmdb import Tmdb +from .imdb import Imdb def search(number, sources: str=None, proxies=None, verify=None, type='adult', + specifiedSource=None, specifiedUrl=None, dbcookies=None, dbsite=None, morestoryline=False): - """ 根据``番号/电影``名搜索信息 + """ 根据`番号/电影`名搜索信息 :param number: number/name depends on type - :param sources: sources string with `,` like ``avsox,javbus`` - :param type: ``adult``, ``general`` + :param sources: sources string with `,` Eg: `avsox,javbus` + :param type: `adult`, `general` """ sc = Scraping() return sc.search(number, sources, proxies=proxies, verify=verify, type=type, + specifiedSource=specifiedSource, specifiedUrl=specifiedUrl, dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline) + +def getSupportedSources(tag='adult'): + """ + :param tag: `adult`, `general` + """ + sc = Scraping() + if tag == 'adult': + return ','.join(sc.adult_full_sources) + else: + return ','.join(sc.general_full_sources) + + class Scraping(): """ """ - - adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', - 'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91', - 'gcolle', 'javdb', 'getchu'] + adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321', + 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91', + 'getchu', 'gcolle' + ] adult_func_mapping = { 'avsox': Avsox().scrape, 'javbus': Javbus().scrape, @@ -57,15 +73,19 @@ class Scraping(): 'gcolle': Gcolle().scrape, 'javdb': Javdb().scrape, 'getchu': Getchu().scrape, + 'javlibrary': Javlibrary().scrape, } - general_full_sources = ['tmdb'] + general_full_sources = ['tmdb','imdb'] general_func_mapping = { 'tmdb': Tmdb().scrape, + 'imdb': Imdb().scrape, } proxies = None verify = None + specifiedSource = None + specifiedUrl = None dbcookies = None dbsite = None @@ -73,9 +93,12 @@ class Scraping(): morestoryline = False def search(self, number, sources=None, proxies=None, verify=None, type='adult', + specifiedSource=None, specifiedUrl=None, dbcookies=None, dbsite=None, morestoryline=False): self.proxies = proxies self.verify = verify + self.specifiedSource = specifiedSource + self.specifiedUrl = specifiedUrl self.dbcookies = dbcookies self.dbsite = dbsite self.morestoryline = morestoryline @@ -88,7 +111,10 @@ class Scraping(): """ 查询电影电视剧 imdb,tmdb """ - sources = self.checkGeneralSources(sources, name) + if self.specifiedSource: + sources = [self.specifiedSource] + else: + sources = self.checkGeneralSources(sources, name) json_data = {} for source in sources: try: @@ -116,7 +142,10 @@ class Scraping(): return json_data def searchAdult(self, number, sources): - sources = self.checkAdultSources(sources, number) + if self.specifiedSource: + sources = [self.specifiedSource] + else: + sources = self.checkAdultSources(sources, number) json_data = {} for source in sources: try: diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index c41cb6e..9cb5213 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -50,10 +50,14 @@ class Avsox(Parser): def getSmallCover(self, htmltree): """ 使用搜索页面的预览小图 """ - return self.getTreeElement(self.searchtree, self.expr_smallcover) + try: + return self.getTreeElement(self.searchtree, self.expr_smallcover) + except: + self.imagecut = 1 + return '' def getTags(self, htmltree): - tags = self.getTreeElement(htmltree).split(',') + tags = self.getTreeElement(htmltree, self.expr_tags).split(',') return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] def getOutline(self, htmltree): diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 3d9f094..decaba6 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -22,7 +22,10 @@ class Carib(Parser): def search(self, number): self.number = number - self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' htmlcode = self.getHtml(self.detailurl) if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: return 404 diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index cc701c9..6edd854 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -29,7 +29,12 @@ class Dlsite(Parser): def search(self, number): self.cookies = {'locale': 'zh-cn'} - if "RJ" in number or "VJ" in number: + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + # TODO 应该从页面内获取 number + self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") + htmltree = self.getHtmlTree(self.detailurl) + elif "RJ" in number or "VJ" in number: self.number = number.upper() self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' htmltree = self.getHtmlTree(self.detailurl) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index f6cc01c..2706b91 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -11,15 +11,21 @@ class Fanza(Parser): expr_title = '//*[starts-with(@id, "title")]/text()' expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - expr_cover = '//head/meta[@property="og:image"]' + expr_cover = './/head/meta[@property="og:image"]/@content' expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" - expr_outline_og = '//head/meta[@property="og:description"]' + expr_outline_og = '//head/meta[@property="og:description"]/@content' expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" def search(self, number): self.number = number + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + durl = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl}) + self.htmltree = self.getHtmlTree(durl) + result = self.dictformat(self.htmltree) + return result # fanza allow letter + number + underscore, normalize the input here # @note: I only find the usage of underscore as h_test123456789 fanza_search_number = number @@ -75,7 +81,7 @@ class Fanza(Parser): if result == '': result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") if "※ 配信方法によって収録内容が異なる場合があります。" == result: - result = self.getTreeElement(htmltree, self.expr_outline_og).get('content') + result = self.getTreeElement(htmltree, self.expr_outline_og) return result except: return '' @@ -99,9 +105,6 @@ class Fanza(Parser): result = self.getFanzaString('配信開始日:') return result.replace("/", "-").strip('\\n') - def getCover(self, htmltree): - return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content') - def getTags(self, htmltree): return self.getFanzaStrings('ジャンル:') diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 13640ed..6707682 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -22,8 +22,11 @@ class Fc2(Parser): expr_tags = "//a[@class='tag tagTag']/text()" def search(self, number): - self.number = number.replace('FC2-', '').replace('fc2-', '') - self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' + self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '') + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/gcolle.py b/scrapinglib/gcolle.py index 100e3ef..c6d7027 100644 --- a/scrapinglib/gcolle.py +++ b/scrapinglib/gcolle.py @@ -2,7 +2,7 @@ import re from lxml import etree -from .httprequest import get_html_session +from .httprequest import request_session from .parser import Parser @@ -27,9 +27,12 @@ class Gcolle(Parser): def search(self, number): self.number = number.upper().replace('GCOLLE-','') - self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number - session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) - htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + self.number).text + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number + session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + htmlcode = session.get(self.detailurl).text htmltree = etree.HTML(htmlcode) r18url = self.getTreeElement(htmltree, self.expr_r18) diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py index eec16ec..1372ba8 100644 --- a/scrapinglib/getchu.py +++ b/scrapinglib/getchu.py @@ -35,7 +35,7 @@ class wwwGetchu(Parser): GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' expr_title = '//*[@id="soft-title"]/text()' - expr_cover = '//head/meta[@property="og:image"]' + expr_cover = '//head/meta[@property="og:image"]/@content' expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" @@ -67,9 +67,6 @@ class wwwGetchu(Parser): def getNum(self, htmltree): return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0] - def getCover(self, htmltree): - return self.getTreeElement(htmltree, self.expr_cover).get('content') - def getActors(self, htmltree): return super().getDirector(htmltree) diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py index 997ff39..7e99819 100644 --- a/scrapinglib/httprequest.py +++ b/scrapinglib/httprequest.py @@ -9,8 +9,9 @@ from cloudscraper import create_scraper G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' G_DEFAULT_TIMEOUT = 10 -def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None, - retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): + +def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None, + retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 网页请求核心函数 @@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: raise Exception('Connect Failed') -def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None, - retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): +def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None, + retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 是否使用代理应由上层处理 """ @@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_ raise Exception('Connect Failed') -# -# TODO: 以下临时使用,更新完各站后,再更新 -# - - class TimeoutHTTPAdapter(HTTPAdapter): def __init__(self, *args, **kwargs): self.timeout = G_DEFAULT_TIMEOUT @@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter): return super().send(request, **kwargs) -# with keep-alive feature -# storyline carib gcolle javdb only -def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None, - encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): +def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): + """ + keep-alive + """ session = requests.Session() retries = Retry(total=retry, connect=retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) @@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ if proxies: session.proxies = proxies session.headers = {"User-Agent": ua or G_USER_AGENT} - try: - if isinstance(url, str) and len(url): - result = session.get(str(url)) - else: # 空url参数直接返回可重用session对象,无需设置return_type - return session - if not result.ok: - return None - if return_type == "object": - return result - elif return_type == "content": - return result.content - elif return_type == "session": - return result, session - else: - result.encoding = encoding or "utf-8" - return result.text - except requests.exceptions.ProxyError: - print("[-]get_html_session() Proxy error! Please check your Proxy") - except Exception as e: - print(f"[-]get_html_session() failed. {e}") - return None + return session + # storyline only # 使用 cloudscraper.... diff --git a/scrapinglib/imdb.py b/scrapinglib/imdb.py new file mode 100644 index 0000000..7aab483 --- /dev/null +++ b/scrapinglib/imdb.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + + +from .parser import Parser + + +class Imdb(Parser): + source = 'imdb' + imagecut = 0 + + expr_title = '//h1[@data-testid="hero-title-block__title"]/text()' + expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' + expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()' + expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()' + + def queryNumberUrl(self, number): + """ + TODO 区分 ID 与 名称 + """ + id = number + movieUrl = "https://www.imdb.com/title/" + id + return movieUrl diff --git a/scrapinglib/jav321.py b/scrapinglib/jav321.py index 31126f4..babdee2 100644 --- a/scrapinglib/jav321.py +++ b/scrapinglib/jav321.py @@ -26,6 +26,14 @@ class Jav321(Parser): return 'https://www.jav321.com/search' def getHtmlTree(self, url): + """ + 特殊处理 仅获取页面调用一次 + """ + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + resp = httprequest.get(self.detailurl, cookies=self.cookies, proxies=self.proxies, verify=self.verify) + self.detailhtml = resp + return etree.fromstring(resp, etree.HTMLParser()) resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify) if "/video/" in resp.url: self.detailurl = resp.url diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index bb6e978..8e52de1 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -32,7 +32,12 @@ class Javbus(Parser): def search(self, number): self.number = number - try: + try: + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + htmltree = self.getHtmlTree(self.detailurl) + result = self.dictformat(htmltree) + return result url = "https://www." + secrets.choice([ 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', 'cdnbus.fun', @@ -61,7 +66,10 @@ class Javbus(Parser): self.uncensored = True w_number = number.replace('.', '-') - self.detailurl = 'https://www.javbus.red/' + w_number + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://www.javbus.red/' + w_number self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index dae5092..3cacd05 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -4,7 +4,7 @@ import re from urllib.parse import urljoin from lxml import etree -from .httprequest import get_html_session +from .httprequest import request_session from .parser import Parser @@ -63,8 +63,11 @@ class Javdb(Parser): def search(self, number: str): self.number = number - self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) - self.detailurl = self.queryNumberUrl(number) + self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(number) self.deatilpage = self.session.get(self.detailurl).text if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage: self.noauth = True @@ -193,19 +196,19 @@ class Javdb(Parser): def getUserRating(self, htmltree): try: - result = str(self.getTreeElement(htmltree, self.expr_userrating)) - v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) - return float(v[0][0]) + numstrs = self.getTreeElement(htmltree, self.expr_userrating) + nums = re.findall('[0-9.]+', numstrs) + return float(nums[0]) except: - return + return '' def getUserVotes(self, htmltree): try: - result = str(self.getTreeElement(htmltree, self.expr_uservotes)) - v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) - return int(v[0][1]) + result = self.getTreeElement(htmltree, self.expr_uservotes) + v = re.findall('[0-9.]+', result) + return int(v[1]) except: - return + return '' def getaphoto(self, url, session): html_page = session.get(url).text diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py new file mode 100644 index 0000000..782fa22 --- /dev/null +++ b/scrapinglib/javlibrary.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +from lxml import etree +from .httprequest import request_session +from .parser import Parser + + +class Javlibrary(Parser): + source = 'javlibrary' + + htmltree = None + + expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()' + expr_title = '//div[@id="video_title"]/h3/a/text()' + expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()' + expr_tags = '//div[@id="video_genres"]/table/tr/td[@class="text"]/span/a/text()' + expr_cover = '//img[@id="video_jacket_img"]/@src' + expr_release = '//div[@id="video_date"]/table/tr/td[@class="text"]/text()' + expr_studio = '//div[@id="video_maker"]/table/tr/td[@class="text"]/span/a/text()' + expr_runtime = '//div[@id="video_length"]/table/tr/td/span[@class="text"]/text()' + expr_userrating = '//div[@id="video_review"]/table/tr/td/span[@class="score"]/text()' + expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()' + expr_extrafanart = '//div[@class="previewthumbs"]/img/@src' + + def updateCore(self, core): + if core.proxies: + self.proxies = core.proxies + if core.verify: + self.verify = core.verify + if core.morestoryline: + self.morestoryline = True + self.cookies = {'over18':'1'} + + def search(self, number): + self.number = number.upper() + self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(self.number) + if not self.detailurl: + return 404 + if self.htmltree is None: + deatils = self.session.get(self.detailurl) + self.htmltree = etree.fromstring(deatils.text, etree.HTMLParser()) + result = self.dictformat(self.htmltree) + return result + + def queryNumberUrl(self, number:str): + queryUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + number + queryResult = self.session.get(queryUrl) + + if queryResult and "/?v=jav" in queryResult.url: + self.htmltree = etree.fromstring(queryResult.text, etree.HTMLParser()) + return queryResult.url + else: + queryTree = etree.fromstring(queryResult.text, etree.HTMLParser()) + numbers = queryTree.xpath('//div[@class="id"]/text()') + if number in numbers: + urls = queryTree.xpath('//div[@class="id"]/../@href') + detailurl = urls[numbers.index(number)] + return "http://www.javlibrary.com/cn" + detailurl.strip('.') + return None + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + title = title.replace(self.getNum(htmltree), '').strip() + return title + + def getCover(self, htmltree): + url = super().getCover(htmltree) + if not url.startswith('http'): + url = 'https:' + url + return url + + def getOutline(self, htmltree): + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number, self.getUncensored(htmltree)) + return '' diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index f132c22..f3ffd20 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -8,6 +8,7 @@ from .parser import Parser class Madou(Parser): source = 'madou' + imagecut = 0 uncensored = True expr_url = '//a[@class="share-weixin"]/@data-url' @@ -17,7 +18,10 @@ class Madou(Parser): def search(self, number): self.number = number.lower().strip() - self.detailurl = "https://madou.club/" + number + ".html" + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = "https://madou.club/" + number + ".html" self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/mgstage.py b/scrapinglib/mgstage.py index a09540b..de279fc 100644 --- a/scrapinglib/mgstage.py +++ b/scrapinglib/mgstage.py @@ -25,7 +25,10 @@ class Mgstage(Parser): def search(self, number): self.number = number.upper() self.cookies = {'adc':'1'} - self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' htmltree =self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) return result diff --git a/scrapinglib/mv91.py b/scrapinglib/mv91.py index cf10144..7d589b1 100644 --- a/scrapinglib/mv91.py +++ b/scrapinglib/mv91.py @@ -8,6 +8,8 @@ from .parser import Parser class Mv91(Parser): source = 'mv91' + imagecut = 0 + uncensored = True expr_number = '//div[@class="player-title"]/text()' expr_title = '//div[@class="player-title"]/text()' @@ -53,8 +55,8 @@ class Mv91(Parser): result = str(finds[0][0]) else: result = ' '.join(title.replace('/',' ').split()) - result = result.split()[0].replace('「预告」','') - return result.strip() + result = result.split()[0] + return result.replace('「预告」','').strip('/ ') except: return '' diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index a861c88..fa25d4e 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -11,7 +11,10 @@ class Parser: """ 基础刮削类 """ source = 'base' - # poster: `0` 复制 `1` 裁剪 + # 推荐剪切poster封面: + # `0` 复制cover + # `1` 裁剪cover + # `3` 下载小封面 imagecut = 1 uncensored = False allow_number_change = False @@ -21,6 +24,7 @@ class Parser: extraheader = None cookies = None morestoryline = False + specifiedUrl = None number = '' detailurl = '' @@ -61,8 +65,19 @@ class Parser: return result def search(self, number): + """ 查询番号 + + 查询主要流程: + 1. 获取 url + 2. 获取详情页面 + 3. 解析 + 4. 返回 result + """ self.number = number - self.detailurl = self.queryNumberUrl(number) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(number) htmltree = self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) return result @@ -79,13 +94,16 @@ class Parser: self.verify = core.verify if core.morestoryline: self.morestoryline = True + if core.specifiedSource == self.source: + self.specifiedUrl = core.specifiedUrl def queryNumberUrl(self, number): """ 根据番号查询详细信息url + 需要针对不同站点修改,或者在上层直接获取 备份查询页面,预览图可能需要 """ - url = httprequest.get(number) + url = "http://detailurl.ai/" + number return url def getHtml(self, url, type = None): @@ -115,26 +133,26 @@ class Parser: 'number': self.getNum(htmltree), 'title': self.getTitle(htmltree), 'studio': self.getStudio(htmltree), + 'release': self.getRelease(htmltree), 'year': self.getYear(htmltree), 'outline': self.getOutline(htmltree), 'runtime': self.getRuntime(htmltree), 'director': self.getDirector(htmltree), 'actor': self.getActors(htmltree), - 'release': self.getRelease(htmltree), + 'actor_photo': self.getActorPhoto(htmltree), 'cover': self.getCover(htmltree), 'cover_small': self.getSmallCover(htmltree), 'extrafanart': self.getExtrafanart(htmltree), 'trailer': self.getTrailer(htmltree), - 'imagecut': self.imagecut, 'tag': self.getTags(htmltree), 'label': self.getLabel(htmltree), - 'actor_photo': self.getActorPhoto(htmltree), + 'series': self.getSeries(htmltree), + 'userrating': self.getUserRating(htmltree), + 'uservotes': self.getUserVotes(htmltree), + 'uncensored': self.getUncensored(htmltree), 'website': self.detailurl, 'source': self.source, - 'series': self.getSeries(htmltree), - 'uncensored': self.getUncensored(htmltree), - 'userrating': self.getUserRating(htmltree), - 'uservotes': self.getUserVotes(htmltree) + 'imagecut': self.getImagecut(htmltree), } dic = self.extradict(dic) except Exception as e: @@ -215,11 +233,26 @@ class Parser: else: return self.uncensored + def getImagecut(self, htmlree): + """ 修正 无码poster不裁剪cover + """ + if self.imagecut == 1 and self.getUncensored(htmlree): + self.imagecut = 0 + return self.imagecut + def getUserRating(self, htmltree): - return self.getTreeElement(htmltree, self.expr_userrating) + numstrs = self.getTreeElement(htmltree, self.expr_userrating) + nums = re.findall('[0-9.]+', numstrs) + if len(nums) == 1: + return float(nums[0]) + return '' def getUserVotes(self, htmltree): - return self.getTreeElement(htmltree, self.expr_uservotes) + votestrs = self.getTreeElement(htmltree, self.expr_uservotes) + votes = re.findall('[0-9]+', votestrs) + if len(votes) == 1: + return int(votes[0]) + return '' def getTreeElement(self, tree: html.HtmlElement, expr, index=0): """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 diff --git a/scrapinglib/storyline.py b/scrapinglib/storyline.py index 458388f..2194c76 100644 --- a/scrapinglib/storyline.py +++ b/scrapinglib/storyline.py @@ -13,7 +13,7 @@ import builtins from urllib.parse import urljoin from lxml.html import fromstring from multiprocessing.dummy import Pool as ThreadPool -from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session +from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session # 舍弃 Amazon 源 G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"} @@ -112,7 +112,8 @@ def getStoryline_airav(number, debug): try: site = secrets.choice(('airav.cc','airav4.club')) url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' - res, session = get_html_session(url, return_type='session') + session = request_session() + res = session.get(url) if not res: raise ValueError(f"get_html_by_session('{url}') failed") lx = fromstring(res.text) diff --git a/scrapinglib/tmdb.py b/scrapinglib/tmdb.py index 1b8e02b..0856b79 100644 --- a/scrapinglib/tmdb.py +++ b/scrapinglib/tmdb.py @@ -13,10 +13,10 @@ class Tmdb(Parser): imagecut = 0 apikey = None - expr_title = '//head/meta[@property="og:title"]' + expr_title = '//head/meta[@property="og:title"]/@content' expr_release = '//div/span[@class="release"]/text()' - expr_cover = '//head/meta[@property="og:image"]' - expr_outline = '//head/meta[@property="og:description"]' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' # def search(self, number): # self.detailurl = self.queryNumberUrl(number) @@ -30,11 +30,6 @@ class Tmdb(Parser): movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN" return movieUrl - def getTitle(self, htmltree): - return self.getTreeElement(htmltree, self.expr_title).get('content') - def getCover(self, htmltree): - return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content') + return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover) - def getOutline(self, htmltree): - return self.getTreeElement(htmltree, self.expr_outline).get('content') diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py index 2be48f6..36230bb 100644 --- a/scrapinglib/xcity.py +++ b/scrapinglib/xcity.py @@ -13,6 +13,9 @@ class Xcity(Parser): expr_number = '//*[@id="hinban"]/text()' expr_title = '//*[@id="program_detail_title"]/text()' + expr_actor = '//ul/li[@class="credit-links"]/a/text()' + expr_actor_link = '//ul/li[@class="credit-links"]/a' + expr_actorphoto = '//div[@class="frame"]/div/p/img/@src' expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()' expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()' expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()' @@ -23,6 +26,7 @@ class Xcity(Parser): expr_director = '//*[@id="program_detail_director"]/text()' expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()" expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()" + expr_extrafanart = '//div[@id="sample_images"]/div/a/@href' def getStudio(self, htmltree): return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') @@ -57,41 +61,29 @@ class Xcity(Parser): return getStoryline(self.number, uncensored=False) return '' - def getActors(self, htmltree): - htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = [] - for i in htmla: - t.append(i.text.strip()) - return t - def getActorPhoto(self, htmltree): - htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = {i.text.strip(): i['href'] for i in htmla} + treea = self.getTreeAll(htmltree, self.expr_actor_link) + t = {i.text.strip(): i.attrib['href'] for i in treea} o = {} for k, v in t.items(): - r = self.browser.open_relative(v) - if not r.ok: - continue - pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') - if 'noimage.gif' in pic['src']: - continue - o[k] = urljoin(self.browser.url, pic['src']) + actorpageUrl = "https://xcity.jp" + v + try: + adtree = self.getHtmlTree(actorpageUrl) + picUrl = self.getTreeElement(adtree, self.expr_actorphoto) + if 'noimage.gif' in picUrl: + continue + o[k] = urljoin("https://xcity.jp", picUrl) + except: + pass return o def getExtrafanart(self, htmltree): - html_pather = re.compile(r'
[\s\S]*?
') - html = html_pather.search(self.detail_page) - if html: - html = html.group() - extrafanart_pather = re.compile(r' Date: Thu, 28 Jul 2022 18:47:41 +0800 Subject: [PATCH 5/6] support specifiedUrl when scraping single movie --- Movie_Data_Capture.py | 16 ++++++++++------ core.py | 4 ++-- scraper.py | 6 ++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py index 10888a1..85b72d8 100644 --- a/Movie_Data_Capture.py +++ b/Movie_Data_Capture.py @@ -83,6 +83,8 @@ def argparse_function(ver: str) -> typing.Tuple[str, str, str, str, bool, bool]: help="""Only show job list of files and numbers, and **NO** actual operation is performed. It may help you correct wrong numbers before real job.""") parser.add_argument("-v", "--version", action="version", version=ver) + parser.add_argument("-ss", "--specified-source", default='', nargs='?', help="specified Source.") + parser.add_argument("-su", "--specified-url", default='', nargs='?', help="specified Url.") args = parser.parse_args() @@ -120,7 +122,7 @@ is performed. It may help you correct wrong numbers before real job.""") if no_net_op: conf.set_override("common:stop_counter=0;rerun_delay=0s;face:aways_imagecut=1") - return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op + return args.file, args.number, args.logdir, args.regexstr, args.zero_op, no_net_op, args.specified_source, args.specified_url class OutLogger(object): @@ -487,13 +489,13 @@ def create_data_and_move(movie_path: str, zero_op: bool, no_net_op: bool, oCC): print('[!]', err) -def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC): +def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC, specified_source, specified_url): conf = config.getInstance() file_name = os.path.basename(file_path) try: print("[!] [{1}] As Number Processing for '{0}'".format(file_path, custom_number)) if custom_number: - core_main(file_path, custom_number, oCC) + core_main(file_path, custom_number, oCC, specified_source, specified_url) else: print("[-] number empty ERROR") print("[*]======================================================") @@ -513,7 +515,7 @@ def create_data_and_move_with_custom_number(file_path: str, custom_number, oCC): def main(args: tuple) -> Path: - (single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op) = args + (single_file_path, custom_number, logdir, regexstr, zero_op, no_net_op, specified_source, specified_url) = args conf = config.getInstance() main_mode = conf.main_mode() folder_path = "" @@ -609,9 +611,11 @@ def main(args: tuple) -> Path: print('[+]==================== Single File =====================') if custom_number == '': create_data_and_move_with_custom_number(single_file_path, - get_number(conf.debug(), os.path.basename(single_file_path)), oCC) + get_number(conf.debug(), os.path.basename(single_file_path)), oCC, + specified_source, specified_url) else: - create_data_and_move_with_custom_number(single_file_path, custom_number, oCC) + create_data_and_move_with_custom_number(single_file_path, custom_number, oCC, + specified_source, specified_url) else: folder_path = conf.source_folder() if not isinstance(folder_path, str) or folder_path == '': diff --git a/core.py b/core.py index dcfd173..5eb2ef8 100644 --- a/core.py +++ b/core.py @@ -760,7 +760,7 @@ def core_main_no_net_op(movie_path, number): linkImage(path, number, part, leak_word, c_word, hack_word, ext) -def core_main(movie_path, number_th, oCC): +def core_main(movie_path, number_th, oCC, specified_source=None, specified_url=None): conf = config.getInstance() # =======================================================================初始化所需变量 multi_part = 0 @@ -775,7 +775,7 @@ def core_main(movie_path, number_th, oCC): # 下面被注释的变量不需要 #rootpath= os.getcwd number = number_th - json_data = get_data_from_json(number, oCC) # 定义番号 + json_data = get_data_from_json(number, oCC, specified_source, specified_url) # 定义番号 # Return if blank dict returned (data not found) if not json_data: diff --git a/scraper.py b/scraper.py index ccfb041..f8bbc06 100644 --- a/scraper.py +++ b/scraper.py @@ -7,7 +7,7 @@ from pathlib import Path from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate from scrapinglib.api import search -def get_data_from_json(file_number, oCC): +def get_data_from_json(file_number, oCC, specified_source, specified_url): """ iterate through all services and fetch the data 从JSON返回元数据 """ @@ -51,9 +51,11 @@ def get_data_from_json(file_number, oCC): cacert =None if conf.cacert_file(): cacert = conf.cacert_file() + json_data = search(file_number, sources, proxies=proxies, verify=cacert, dbsite=javdb_site, dbcookies=javdb_cookies, - morestoryline=conf.is_storyline()) + morestoryline=conf.is_storyline(), + specifiedSource=specified_source, specifiedUrl=specified_url) # Return if data not found in all sources if not json_data: print('[-]Movie Number not found!') From 6de2e8f60fd8b1eec6d5c2cb60c1233c90e39978 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 28 Jul 2022 23:07:51 +0800 Subject: [PATCH 6/6] fix storyline --- scrapinglib/avsox.py | 2 +- scrapinglib/carib.py | 3 +- scrapinglib/httprequest.py | 42 +---------------- scrapinglib/javbus.py | 3 +- scrapinglib/javdb.py | 3 +- scrapinglib/javlibrary.py | 3 +- scrapinglib/parser.py | 2 + scrapinglib/storyline.py | 94 +++++++++++++++----------------------- scrapinglib/xcity.py | 45 ++++++------------ 9 files changed, 61 insertions(+), 136 deletions(-) diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index 9cb5213..9c324a6 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -63,7 +63,7 @@ class Avsox(Parser): def getOutline(self, htmltree): if self.morestoryline: from .storyline import getStoryline - return getStoryline(self.number) + return getStoryline(self.number, proxies=self.proxies, verify=self.verify) return '' def getActors(self, htmltree): diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index decaba6..cc04ae7 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -92,7 +92,8 @@ class Carib(Parser): def getOutline(self, htmltree): if self.morestoryline: from .storyline import getStoryline - result = getStoryline(self.number, uncensored=self.uncensored) + result = getStoryline(self.number, uncensored=self.uncensored, + proxies=self.proxies, verify=self.verify) if len(result): return result return super().getOutline(htmltree) diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py index 7e99819..e987d63 100644 --- a/scrapinglib/httprequest.py +++ b/scrapinglib/httprequest.py @@ -44,7 +44,7 @@ def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: s raise Exception('Connect Failed') -def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None, +def post(url: str, data: dict=None, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 是否使用代理应由上层处理 @@ -109,46 +109,6 @@ def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEF return session -# storyline only -# 使用 cloudscraper.... -def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None, - encoding: str = None, use_scraper: bool = False, - retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): - session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session() - if isinstance(cookies, dict) and len(cookies): - requests.utils.add_dict_to_cookiejar(session.cookies, cookies) - retries = Retry(total=retry, connect=retry, backoff_factor=1, - status_forcelist=[429, 500, 502, 503, 504]) - session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout)) - session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout)) - if verify: - session.verify = verify - if proxies: - session.proxies = proxies - try: - browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session) - if isinstance(url, str) and len(url): - result = browser.open(url) - else: - return browser - if not result.ok: - return None - - if return_type == "object": - return result - elif return_type == "content": - return result.content - elif return_type == "browser": - return result, browser - else: - result.encoding = encoding or "utf-8" - return result.text - except requests.exceptions.ProxyError: - print("[-]get_html_by_browser() Proxy error! Please check your Proxy") - except Exception as e: - print(f'[-]get_html_by_browser() Failed! {e}') - return None - # storyline xcity only def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None, encoding: str = None, diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index 8e52de1..eb559c0 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -136,5 +136,6 @@ class Javbus(Parser): if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 from .storyline import getStoryline - return getStoryline(self.number , uncensored = self.uncensored) + return getStoryline(self.number , uncensored = self.uncensored, + proxies=self.proxies, verify=self.verify) return '' diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index 3cacd05..c21a819 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -176,7 +176,8 @@ class Javdb(Parser): def getOutline(self, htmltree): if self.morestoryline: from .storyline import getStoryline - return getStoryline(self.number, self.getUncensored(htmltree)) + return getStoryline(self.number, self.getUncensored(htmltree), + proxies=self.proxies, verify=self.verify) return '' def getTrailer(self, htmltree): diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py index 782fa22..b2c7d19 100644 --- a/scrapinglib/javlibrary.py +++ b/scrapinglib/javlibrary.py @@ -76,5 +76,6 @@ class Javlibrary(Parser): def getOutline(self, htmltree): if self.morestoryline: from .storyline import getStoryline - return getStoryline(self.number, self.getUncensored(htmltree)) + return getStoryline(self.number, self.getUncensored(htmltree), + proxies=self.proxies, verify=self.verify) return '' diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index fa25d4e..90670ff 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -88,6 +88,8 @@ class Parser: 针对需要传递的参数: cookies, proxy等 子类继承后修改 """ + if not core: + return if core.proxies: self.proxies = core.proxies if core.verify: diff --git a/scrapinglib/storyline.py b/scrapinglib/storyline.py index 2194c76..306789a 100644 --- a/scrapinglib/storyline.py +++ b/scrapinglib/storyline.py @@ -5,6 +5,7 @@ """ +import json import os import re import time @@ -13,7 +14,10 @@ import builtins from urllib.parse import urljoin from lxml.html import fromstring from multiprocessing.dummy import Pool as ThreadPool -from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session + +from scrapinglib.airav import Airav +from scrapinglib.xcity import Xcity +from .httprequest import get_html_by_form, get_html_by_scraper, request_session # 舍弃 Amazon 源 G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"} @@ -35,7 +39,7 @@ class noThread(object): # 获取剧情介绍 从列表中的站点同时查,取值优先级从前到后 -def getStoryline(number, title = None, sites: list=None, uncensored=None): +def getStoryline(number, title=None, sites: list=None, uncensored=None, proxies=None, verify=None): start_time = time.time() debug = False storyine_sites = "1:avno1,4:airavwiki".split(',') @@ -52,7 +56,7 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None): r_dup.add(ns) sort_sites.sort() apply_sites = [re.sub(r'.*?:', '', s, re.A) for s in sort_sites] - mp_args = ((site, number, title, debug) for site in apply_sites) + mp_args = ((site, number, title, debug, proxies, verify) for site in apply_sites) cores = min(len(apply_sites), os.cpu_count()) if cores == 0: return '' @@ -79,24 +83,21 @@ def getStoryline(number, title = None, sites: list=None, uncensored=None): def getStoryline_mp(args): - (site, number, title, debug) = args + (site, number, title, debug, proxies, verify) = args start_time = time.time() storyline = None if not isinstance(site, str): return storyline elif site == "airavwiki": - storyline = getStoryline_airavwiki(number, debug) - #storyline = getStoryline_airavwiki_super(number, debug) + storyline = getStoryline_airavwiki(number, debug, proxies, verify) elif site == "airav": - storyline = getStoryline_airav(number, debug) + storyline = getStoryline_airav(number, debug, proxies, verify) elif site == "avno1": - storyline = getStoryline_avno1(number, debug) + storyline = getStoryline_avno1(number, debug, proxies, verify) elif site == "xcity": - storyline = getStoryline_xcity(number, debug) - # elif site == "amazon": - # storyline = getStoryline_amazon(title, number, debug) + storyline = getStoryline_xcity(number, debug, proxies, verify) elif site == "58avgo": - storyline = getStoryline_58avgo(number, debug) + storyline = getStoryline_58avgo(number, debug, proxies, verify) if not debug: return storyline print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format( @@ -108,11 +109,11 @@ def getStoryline_mp(args): return storyline -def getStoryline_airav(number, debug): +def getStoryline_airav(number, debug, proxies, verify): try: site = secrets.choice(('airav.cc','airav4.club')) url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' - session = request_session() + session = request_session(proxies=proxies, verify=verify) res = session.get(url) if not res: raise ValueError(f"get_html_by_session('{url}') failed") @@ -143,36 +144,16 @@ def getStoryline_airav(number, debug): return None -def getStoryline_airavwiki(number, debug): +def getStoryline_airavwiki(number, debug, proxies, verify): try: kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number - url = f'https://cn.airav.wiki/?search={kwd}' - result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True) - if not result.ok: - raise ValueError(f"get_html_by_browser('{url}','{number}') failed") - s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block') - link = None - for a in s: - title = a.img['title'] - list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip() - if kwd == number: # 番号PRED-164 和 RED-164需要能够区分 - if re.match(f'^{number}$', list_number, re.I): - link = a - break - elif re.search(number, list_number, re.I): - link = a - break - if link is None: - raise ValueError("number not found") - result = browser.follow_link(link) - if not result.ok or not re.search(number, browser.url, re.I): - raise ValueError("detail page not found") - title = browser.page.select('head > title')[0].text.strip() - detail_number = str(re.findall('\[(.*?)]', title)[0]) - if not re.search(number, detail_number, re.I): - raise ValueError(f"detail page number not match, got ->[{detail_number}]") - desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip() - return desc + airavwiki = Airav() + airavwiki.addtion_Javbus = False + airavwiki.proxies = proxies + airavwiki.verify = verify + jsons = airavwiki.search(kwd) + outline = json.loads(jsons).get('outline') + return outline except Exception as e: if debug: print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].") @@ -180,7 +161,7 @@ def getStoryline_airavwiki(number, debug): return '' -def getStoryline_58avgo(number, debug): +def getStoryline_58avgo(number, debug, proxies, verify): try: url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([ '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12', @@ -189,6 +170,7 @@ def getStoryline_58avgo(number, debug): kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number result, browser = get_html_by_form(url, fields = {'ctl00$TextBox_SearchKeyWord' : kwd}, + proxies=proxies, verify=verify, return_type = 'browser') if not result: raise ValueError(f"get_html_by_form('{url}','{number}') failed") @@ -219,13 +201,13 @@ def getStoryline_58avgo(number, debug): return '' -def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 +def getStoryline_avno1(number, debug, proxies, verify): #获取剧情介绍 从avno1.cc取得 try: site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc', 'hotav.biz','iqq2.xyz','javhq.tv', 'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',]) url = f'http://{site}/cn/search.php?kw_type=key&kw={number}' - lx = fromstring(get_html_by_scraper(url)) + lx = fromstring(get_html_by_scraper(url, proxies=proxies, verify=verify)) descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description') titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()') if not descs or not len(descs): @@ -246,7 +228,7 @@ def getStoryline_avno1(number, debug): #获取剧情介绍 从avno1.cc取得 return '' -def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 +def getStoryline_avno1OLD(number, debug, proxies, verify): #获取剧情介绍 从avno1.cc取得 try: url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), @@ -255,6 +237,7 @@ def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 result, browser = get_html_by_form(url, form_select='div.wrapper > div.header > div.search > form', fields = {'kw' : number}, + proxies=proxies, verify=verify, return_type = 'browser') if not result: raise ValueError(f"get_html_by_form('{url}','{number}') failed") @@ -272,19 +255,14 @@ def getStoryline_avno1OLD(number, debug): #获取剧情介绍 从avno1.cc取得 return '' -def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得 +def getStoryline_xcity(number, debug, proxies, verify): #获取剧情介绍 从xcity取得 try: - xcity_number = number.replace('-','') - query_result, browser = get_html_by_form( - 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), - fields = {'q' : xcity_number.lower()}, - return_type = 'browser') - if not query_result or not query_result.ok: - raise ValueError("page not found") - result = browser.follow_link(browser.links('avod\/detail')[0]) - if not result.ok: - raise ValueError("detail page not found") - return browser.page.select_one('h2.title-detail + p.lead').text.strip() + xcityEngine = Xcity() + xcityEngine.proxies = proxies + xcityEngine.verify = verify + jsons = xcityEngine.search(number) + outline = json.loads(jsons).get('outline') + return outline except Exception as e: if debug: print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].") diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py index 36230bb..05beb39 100644 --- a/scrapinglib/xcity.py +++ b/scrapinglib/xcity.py @@ -3,7 +3,6 @@ import re import secrets from urllib.parse import urljoin -from lxml import etree from .httprequest import get_html_by_form from .parser import Parser @@ -27,6 +26,19 @@ class Xcity(Parser): expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()" expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()" expr_extrafanart = '//div[@id="sample_images"]/div/a/@href' + expr_outline = '//head/meta[@property="og:description"]/@content' + + def queryNumberUrl(self, number): + xcity_number = number.replace('-','') + query_result, browser = get_html_by_form( + 'https://xcity.jp/' + secrets.choice(['sitemap/','policy/','law/','help/','main/']), + fields = {'q' : xcity_number.lower()}, + cookies=self.cookies, proxies=self.proxies, verify=self.verify, + return_type = 'browser') + if not query_result or not query_result.ok: + raise ValueError("xcity.py: page not found") + prelink = browser.links('avod\/detail')[0]['href'] + return urljoin('https://xcity.jp', prelink) def getStudio(self, htmltree): return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') @@ -55,12 +67,6 @@ class Xcity(Parser): except: return '' - def getOutline(self, htmltree): - if self.morestoryline: - from .storyline import getStoryline - return getStoryline(self.number, uncensored=False) - return '' - def getActorPhoto(self, htmltree): treea = self.getTreeAll(htmltree, self.expr_actor_link) t = {i.text.strip(): i.attrib['href'] for i in treea} @@ -84,28 +90,3 @@ class Xcity(Parser): i = "https:" + i extrafanart.append(i) return extrafanart - - def open_by_browser(self, number): - xcity_number = number.replace('-','') - query_result, browser = get_html_by_form( - 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), - fields = {'q' : xcity_number.lower()}, - return_type = 'browser') - if not query_result or not query_result.ok: - raise ValueError("xcity.py: page not found") - result = browser.follow_link(browser.links('avod\/detail')[0]) - if not result.ok: - raise ValueError("xcity.py: detail page not found") - return str(browser.page), browser - - def search(self, number): - self.number = number - if self.specifiedUrl: - self.detailurl = self.specifiedUrl - lx = self.getHtmlTree(self.detailurl) - else: - self.detail_page, self.browser = self.open_by_browser(number) - self.detailurl = self.browser.url - lx = etree.fromstring(self.detail_page, etree.HTMLParser()) - result = self.dictformat(lx) - return result