From f11378186de02420163151e07d0c38d10e7d20f1 Mon Sep 17 00:00:00 2001
From: Mathhew
Date: Mon, 13 Jun 2022 09:02:05 +0800
Subject: [PATCH] update lib

---
 scrapinglib/api.py         | 74 ++++++++++++++++++++++++++++++--------
 scrapinglib/fanza.py       | 11 ++++--
 scrapinglib/httprequest.py | 12 +++----
 scrapinglib/javdb.py       | 40 ++++++++++++++++-----
 scrapinglib/parser.py      | 20 +++-------
 scrapinglib/utils.py       | 31 ++++++++++++++++
 6 files changed, 140 insertions(+), 48 deletions(-)
 create mode 100644 scrapinglib/utils.py

diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index cbe9138..ba85acf 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -57,10 +57,10 @@ class Scraping():
     """
-    full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
+    adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
                     'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
                     'gcolle', 'javdb', 'getchu']
-    func_mapping = {
+    adult_func_mapping = {
         'avsox': Avsox().scrape,
         'javbus': Javbus().scrape,
         'xcity': Xcity().scrape,
@@ -78,6 +78,11 @@ class Scraping():
         'getchu': Getchu().scrape,
     }
 
+    general_full_sources = ['tmdb']
+    general_func_mapping = {
+        'tmdb': Tmdb().scrape,
+    }
+
     proxies = None
     verify = None
 
@@ -98,22 +103,45 @@ class Scraping():
         else:
             return self.searchGeneral(number, sources)
 
-    def searchGeneral(self, number, sources):
+    def searchGeneral(self, name, sources):
         """ search movie and TV series metadata
             imdb, tmdb
         """
-        data = Tmdb().scrape(number, self)
-        json_data = json.loads(data)
-        return json_data
-
-    def searchAdult(self, number, sources):
-        sources = self.checkSources(sources, number)
+        sources = self.checkGeneralSources(sources, name)
         json_data = {}
         for source in sources:
             try:
                 print('[+]select', source)
                 try:
-                    data = self.func_mapping[source](number, self)
+                    data = self.general_func_mapping[source](name, self)
+                    if data == 404:
+                        continue
+                    json_data = json.loads(data)
+                except Exception as e:
+                    print('[!] 出错啦')
+                    print(e)
+                # if any service returns valid data, break
+                if self.get_data_state(json_data):
+                    print(f"[+]Find movie [{name}] metadata on website '{source}'")
+                    break
+            except:
+                continue
+
+        # return None if no source produced data
+        if not json_data:
+            print(f'[-]Movie Number [{name}] not found!')
+            return None
+
+        return json_data
+
+    def searchAdult(self, number, sources):
+        sources = self.checkAdultSources(sources, number)
+        json_data = {}
+        for source in sources:
+            try:
+                print('[+]select', source)
+                try:
+                    data = self.adult_func_mapping[source](number, self)
                     if data == 404:
                         continue
                     json_data = json.loads(data)
@@ -135,10 +163,26 @@ class Scraping():
 
         return json_data
 
-
-    def checkSources(self, c_sources, file_number):
+    def checkGeneralSources(self, c_sources, name):
         if not c_sources:
-            sources = self.full_sources
+            sources = self.general_full_sources
+        else:
+            sources = c_sources.split(',')
+
+        # drop any requested source that is not in general_func_mapping
+        todel = []
+        for s in sources:
+            if not s in self.general_func_mapping:
+                print('[!] Source Not Exist : ' + s)
+                todel.append(s)
+        for d in todel:
+            print('[!] Remove Source : ' + d)
+            sources.remove(d)
+        return sources
+
+    def checkAdultSources(self, c_sources, file_number):
+        if not c_sources:
+            sources = self.adult_full_sources
         else:
             sources = c_sources.split(',')
             def insert(sources, source):
                 if source in sources:
                     sources.insert(0, sources.pop(sources.index(source)))
                 return sources
 
-            if len(sources) <= len(self.func_mapping):
+            if len(sources) <= len(self.adult_func_mapping):
                 # if the input file name matches certain rules,
                 # move some web services to the beginning of the list
                 lo_file_number = file_number.lower()
@@ -182,7 +226,7 @@ class Scraping():
         # check sources in func_mapping
         todel = []
         for s in sources:
-            if not s in self.func_mapping:
+            if not s in self.adult_func_mapping:
                 print('[!] Source Not Exist : ' + s)
                 todel.append(s)
         for d in todel:
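With the split above, Scraping.search() dispatches to either searchGeneral() or searchAdult(), and each path filters its own source list before scraping. A minimal usage sketch, not part of the patch (the proxy/cookie setup and the exact search() dispatch arguments are assumptions; the 'title' key is assumed from the scrapers' JSON layout):

    from scrapinglib.api import Scraping

    scraper = Scraping()

    # Adult path: the comma-separated string is filtered by checkAdultSources(),
    # so an unknown entry is dropped with a '[!] Remove Source' warning.
    adult_meta = scraper.searchAdult('ABC-123', 'javdb,javbus')   # made-up number

    # General path: passing None falls back to general_full_sources
    # (currently only 'tmdb').
    movie_meta = scraper.searchGeneral('Inception', None)
    if movie_meta:
        print(movie_meta.get('title'))

Both methods return the parsed metadata dict, or None when every source misses.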
diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py
index c06d26c..b3e5824 100644
--- a/scrapinglib/fanza.py
+++ b/scrapinglib/fanza.py
@@ -10,6 +10,7 @@ class Fanza(Parser):
     source = 'fanza'
 
     expr_title = '//*[starts-with(@id, "title")]/text()'
+    expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
     expr_outline = "//div[@class='mg-b20 lh4']/text()"
     expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
     expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
@@ -132,10 +133,16 @@ class Fanza(Parser):
             return ''
 
     def getLabel(self, htmltree):
-        return self.getFanzaStrings('レーベル')
+        ret = self.getFanzaStrings('レーベル')
+        if ret == "----":
+            return ''
+        return ret
 
     def getSeries(self, htmltree):
-        return self.getFanzaStrings('シリーズ:')
+        ret = self.getFanzaStrings('シリーズ:')
+        if ret == "----":
+            return ''
+        return ret
 
     def getFanzaString(self, expr):
         result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")

diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py
index 0c677c0..997ff39 100644
--- a/scrapinglib/httprequest.py
+++ b/scrapinglib/httprequest.py
@@ -23,8 +23,7 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     for i in range(retry):
         try:
             result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
-                                  verify=verify,
-                                  cookies=cookies)
+                                  verify=verify, cookies=cookies)
             if return_type == "object":
                 return result
             elif return_type == "content":
@@ -44,8 +43,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     raise Exception('Connect Failed')
 
 
-def post(url: str, data: dict, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
-         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
+         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """ whether to use a proxy should be decided by the caller
     """
@@ -54,9 +53,8 @@ def post(url: str, data: dict, cookies = None, ua: str = None, return_type: str
 
     for i in range(retry):
         try:
-            result = requests.post(url, data=data, headers=headers, timeout=timeout, proxies=proxies,
-                                   verify=verify,
-                                   cookies=cookies)
+            result = requests.post(url, data=data, files=files, headers=headers, timeout=timeout, proxies=proxies,
+                                   verify=verify, cookies=cookies)
             if return_type == "object":
                 return result
             elif return_type == "content":
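The only functional change to httprequest is the new files parameter, which is handed straight to requests.post(); the rest is line reflow. Callers can now send multipart/form-data through the same retry/proxy plumbing. A short sketch under stated assumptions (the upload URL and field names are placeholders, not part of this library):

    from scrapinglib import httprequest

    with open('cover.jpg', 'rb') as fp:
        resp = httprequest.post(
            'https://example.com/upload',                     # placeholder URL
            data={'title': 'test'},
            files={'file': ('cover.jpg', fp, 'image/jpeg')},  # multipart tuple, as requests expects
            return_type='object',
        )
    if resp is not None:
        print(resp.status_code)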
diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py
index a5ad159..839c166 100644
--- a/scrapinglib/javdb.py
+++ b/scrapinglib/javdb.py
@@ -4,7 +4,6 @@
 import re
 from urllib.parse import urljoin
 from lxml import etree
-from requests import session
 from .httprequest import get_html_session
 from .parser import Parser
 
@@ -13,22 +12,26 @@ class Javdb(Parser):
     source = 'javdb'
 
     fixstudio = False
+    noauth = False
 
     expr_number = '//strong[contains(text(),"番號")]/../span/text()'
     expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
     expr_title = "/html/head/title/text()"
+    expr_title_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/text()'
     expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
     expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
     expr_uncensored = '//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?") or contains(@href,"/tags/western?")]'
     expr_actor = '//span[@class="value"]/a[contains(@href,"/actors/")]/text()'
     expr_actor2 = '//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class'
     expr_release = '//strong[contains(text(),"日期")]/../span/text()'
+    expr_release_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "meta")]/text()'
     expr_studio = '//strong[contains(text(),"片商")]/../span/a/text()'
     expr_studio2 = '//strong[contains(text(),"賣家:")]/../span/a/text()'
     expr_director = '//strong[contains(text(),"導演")]/../span/text()'
     expr_director2 = '//strong[contains(text(),"導演")]/../span/a/text()'
     expr_cover = "//div[contains(@class, 'column-video-cover')]/a/img/@src"
     expr_cover2 = "//div[contains(@class, 'column-video-cover')]/img/@src"
+    expr_cover_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "cover")]/img/@src'
     expr_extrafanart = "//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href"
     expr_tags = '//strong[contains(text(),"類別")]/../span/a/text()'
     expr_tags2 = '//strong[contains(text(),"類別")]/../span/text()'
@@ -57,14 +60,18 @@ class Javdb(Parser):
         else:
             self.dbsite = 'javdb'
 
-    def search(self, number):
+    def search(self, number: str):
         self.number = number
         self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
         self.detailurl = self.queryNumberUrl(number)
-        self.deatilpage = self.session.get(self.detailurl).text
-        htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
-        result = self.dictformat(htmltree)
+        self.deatilpage = self.session.get(self.detailurl).text
+        if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
+            self.noauth = True
+            self.imagecut = 0
+            result = self.dictformat(self.querytree)
+        else:
+            htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
+            result = self.dictformat(htmltree)
         return result
 
     def queryNumberUrl(self, number):
@@ -75,18 +82,19 @@ class Javdb(Parser):
             print(e)
             raise Exception(f'[!] {self.number}: page not found in javdb')
 
-        htmltree = etree.fromstring(resp.text, etree.HTMLParser())
+        self.querytree = etree.fromstring(resp.text, etree.HTMLParser())
         # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
         # iterate all candidates and find the matching one
-        urls = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/@href')
+        urls = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
         # western IDs look like ['Blacked','Blacked']
         if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
             correct_url = urls[0]
         else:
-            ids = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
+            ids = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
             try:
-                correct_url = urls[ids.index(number)]
+                self.queryid = ids.index(number)
+                correct_url = urls[self.queryid]
             except:
                 # to avoid picking up a wrong number, accept only an exact match
                 if ids[0].upper() != number:
@@ -95,6 +103,8 @@ class Javdb(Parser):
         return urljoin(resp.url, correct_url)
 
     def getNum(self, htmltree):
+        if self.noauth:
+            return self.number
         result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']")
         result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']")
         dp_number = str(result2 + result1).strip('+')
@@ -105,10 +115,22 @@ class Javdb(Parser):
         return self.number
 
     def getTitle(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_title_no, self.queryid)
         browser_title = super().getTitle(htmltree)
         title = browser_title[:browser_title.find(' | JavDB')].strip()
         return title.replace(self.number, '').strip()
 
+    def getCover(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_cover_no, self.queryid)
+        return super().getCover(htmltree)
+
+    def getRelease(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_release_no, self.queryid).strip()
+        return super().getRelease(htmltree)
+
     def getRuntime(self, htmltree):
         result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']")
         result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']")
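The noauth fallback works because queryNumberUrl() now caches two things from the search-results page: self.querytree (the parsed list page) and self.queryid (the row whose number matched exactly). The three *_no expressions select parallel node lists from the same movie-list markup, so one index lines up title, cover and release date. A reduced, self-contained illustration of that indexing idea (the markup below is a stand-in, not real JavDB output):

    from lxml import etree

    page = etree.fromstring('''
    <div class="movie-list">
      <div><a href="/v/a1">
        <div class="video-title"><strong>ABC-122</strong> title one</div>
        <div class="meta">2022-06-01</div></a></div>
      <div><a href="/v/a2">
        <div class="video-title"><strong>ABC-123</strong> title two</div>
        <div class="meta">2022-06-02</div></a></div>
    </div>''', etree.HTMLParser())

    base = '//*[contains(@class,"movie-list")]/div/a'
    ids = page.xpath(base + '/div[contains(@class,"video-title")]/strong/text()')
    urls = page.xpath(base + '/@href')
    dates = page.xpath(base + '/div[contains(@class,"meta")]/text()')

    queryid = ids.index('ABC-123')                  # the exact-match row
    print(urls[queryid], dates[queryid].strip())    # /v/a2 2022-06-02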
diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py
index c27665c..14493d8 100644
--- a/scrapinglib/parser.py
+++ b/scrapinglib/parser.py
@@ -3,12 +3,14 @@
 import json
 import re
 from lxml import etree, html
 from . import httprequest
+from .utils import getTreeElement, getTreeAll
 
 
 class Parser:
 
     source = 'base'
 
+    # poster image: `0` copy the cover, `1` crop it from the cover
     imagecut = 1
     uncensored = False
     allow_number_change = False
@@ -249,21 +251,9 @@ class Parser:
 
     def getTreeIndex(self, tree: html.HtmlElement, expr, index=0):
         """ get the value matched by the expression from the `xmltree`, index defaults to 0 """
-        if expr == '':
-            return ''
-        result = tree.xpath(expr)
-        try:
-            return result[index]
-        except:
-            return ''
+        return getTreeElement(tree, expr, index)
 
     def getAll(self, tree: html.HtmlElement, expr):
         """ get all values matched by the expression from the `xmltree` """
-        if expr == '':
-            return ''
-        result = tree.xpath(expr)
-        try:
-            return result
-        except:
-            return ''
+        return getTreeAll(tree, expr)
 
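Parser.getTreeIndex() and Parser.getAll() keep their signatures, so every scraper built on the expr_* attributes is unaffected; only the implementation moves to utils, where code outside the Parser hierarchy can share it. A quick sketch of the preserved safe-miss semantics (assuming the package is importable):

    from lxml import html
    from scrapinglib.utils import getTreeElement, getTreeAll

    tree = html.fromstring('<ul><li>a</li><li>b</li></ul>')

    assert getTreeElement(tree, '//li/text()') == 'a'     # default index=0
    assert getTreeElement(tree, '//li/text()', 5) == ''   # out of range -> ''
    assert getTreeAll(tree, '//li/text()') == ['a', 'b']
    assert getTreeAll(tree, '') == ''                     # empty expression -> ''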
diff --git a/scrapinglib/utils.py b/scrapinglib/utils.py
new file mode 100644
index 0000000..490d34f
--- /dev/null
+++ b/scrapinglib/utils.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from lxml.html import HtmlElement
+
+def getTreeElement(tree: HtmlElement, expr, index=0):
+    """ get the value matched by the expression from the `xmltree`, index defaults to 0
+    :param tree (html.HtmlElement)
+    :param expr
+    :param index
+    """
+    if expr == '':
+        return ''
+    result = tree.xpath(expr)
+    try:
+        return result[index]
+    except:
+        return ''
+
+
+def getTreeAll(tree: HtmlElement, expr):
+    """ get all values matched by the expression from the `xmltree`
+    :param tree (html.HtmlElement)
+    :param expr
+    """
+    if expr == '':
+        return ''
+    result = tree.xpath(expr)
+    try:
+        return result
+    except:
+        return ''
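Since the helpers accept any XPath, they cover attribute lookups (such as the @src and @href expressions used by the scrapers) as well as text() lookups. One more usage sketch on toy markup:

    from lxml import html
    from scrapinglib.utils import getTreeElement, getTreeAll

    tree = html.fromstring(
        '<div class="cover"><img src="/img/1.jpg"/><img src="/img/2.jpg"/></div>')

    print(getTreeElement(tree, '//div[@class="cover"]/img/@src'))  # /img/1.jpg
    print(getTreeAll(tree, '//div[@class="cover"]/img/@src'))      # ['/img/1.jpg', '/img/2.jpg']

Note that tree.xpath() runs outside the try/except, so a syntactically invalid expression still raises; only an empty expression or a missing match is silenced.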