From b906be8099bb62a0b350381e81b665ec73e37b6a Mon Sep 17 00:00:00 2001 From: hejianjun Date: Sun, 6 Nov 2022 20:27:55 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=83=E7=94=A891mv,=E6=8D=A2=E7=94=A8?= =?UTF-8?q?=E6=96=B0=E7=BD=91=E7=AB=99javday?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapinglib/api.py | 8 ++-- scrapinglib/javday.py | 43 ++++++++++++++++++++ scrapinglib/mv91.py | 94 ------------------------------------------- 3 files changed, 47 insertions(+), 98 deletions(-) create mode 100644 scrapinglib/javday.py delete mode 100644 scrapinglib/mv91.py diff --git a/scrapinglib/api.py b/scrapinglib/api.py index 3cd8917..83116ee 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -11,7 +11,6 @@ from .gcolle import Gcolle from .getchu import Getchu from .jav321 import Jav321 from .javdb import Javdb -from .mv91 import Mv91 from .fc2 import Fc2 from .madou import Madou from .mgstage import Mgstage @@ -19,6 +18,7 @@ from .javbus import Javbus from .xcity import Xcity from .avsox import Avsox from .javlibrary import Javlibrary +from .javday import Javday from .tmdb import Tmdb from .imdb import Imdb @@ -50,8 +50,8 @@ class Scraping: """ """ adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321', - 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91', - 'getchu', 'gcolle' + 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', + 'getchu', 'gcolle','javday' ] adult_func_mapping = { 'avsox': Avsox().scrape, @@ -65,11 +65,11 @@ class Scraping: 'fanza': Fanza().scrape, 'airav': Airav().scrape, 'carib': Carib().scrape, - 'mv91': Mv91().scrape, 'gcolle': Gcolle().scrape, 'javdb': Javdb().scrape, 'getchu': Getchu().scrape, 'javlibrary': Javlibrary().scrape, + 'javday': Javday().scrape } general_full_sources = ['tmdb', 'imdb'] diff --git a/scrapinglib/javday.py b/scrapinglib/javday.py new file mode 100644 index 0000000..01ecc09 --- /dev/null +++ b/scrapinglib/javday.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urlparse, unquote +from .parser import Parser + + +class Javday(Parser): + source = 'javday' + + expr_url = '/html/head/meta[@property="og:url"]/@content' + expr_cover = '/html/head/meta[@property="og:image"]/@content' + expr_tags = '/html/head/meta[@name="keywords"]/@content' + expr_title = "/html/head/title/text()" + expr_actor = "//span[@class='vod_actor']/a/text()" + expr_studio = '//span[@class="producer"]/a/text()' + expr_number = '//span[@class="jpnum"]/text()' + + def extraInit(self): + self.imagecut = 4 + self.uncensored = True + + def search(self, number): + self.number = number.strip().upper() + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = "https://javday.tv/videos/" + self.number.replace("-","") + "/" + self.htmlcode = self.getHtml(self.detailurl) + if self.htmlcode == 404: + return 404 + htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) + self.detailurl = self.getTreeElement(htmltree, self.expr_url) + + result = self.dictformat(htmltree) + return result + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + # 删除番号和网站名 + result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip() + return result diff --git a/scrapinglib/mv91.py b/scrapinglib/mv91.py deleted file mode 100644 index 65a7f7e..0000000 --- a/scrapinglib/mv91.py +++ /dev/null @@ -1,94 +0,0 @@ -# -*- coding: utf-8 -*- - - -import re -from lxml import etree -from .parser import Parser - - -class Mv91(Parser): - source = 'mv91' - - expr_number = '//div[@class="player-title"]/text()' - expr_title = '//div[@class="player-title"]/text()' - expr_release = '//p[@class="date"]/text()' - expr_outline = '//div[@class="play-text"]/text()' - expr_tags = '//div[@class="player-tag"]/text()' - expr_actor = '//p[@class="player-name"]/text()' - - def extraInit(self): - self.imagecut = 0 - self.uncensored = True - - def getHtmlTree(self, url, type=None): - self.htmlcode = self.getHtml(url, type) - if self.htmlcode == 404: - return 404 - ret = etree.fromstring(self.htmlcode, etree.HTMLParser()) - return ret - - def queryNumberUrl(self, number): - keyword = number.replace('91CM-','').replace('91MS-','') - search_html = self.getHtml('https://www.91mv.org/index/search?keywords=' + keyword) - html = etree.fromstring(search_html, etree.HTMLParser()) - endurl = html.xpath('//a[@class="video-list"]/@href')[0] - return 'https://www.91mv.org' + endurl - - def getNum(self, htmltree): - try: - num = super().getNum(htmltree) - finds = re.findall('(.*)(91.*-\d*)',num) - if finds: - result = str(finds[0][1]) - else: - result = ' '.join(num.replace('/',' ').split()) - result = result.split()[1] - if self.number.upper() != result.upper(): - raise Exception(f'[!] {self.number}: find {result} in mv91, not match') - return result.strip() - except: - return '' - - def getTitle(self, htmltree): - try: - title = super().getTitle(htmltree) - finds = re.findall('(.*)(91.*-\d*)',title) - if finds: - result = str(finds[0][0]) - else: - result = ' '.join(title.replace('/',' ').split()) - result = result.split()[0] - return result.replace('「预告」','').strip('/ ') - except: - return '' - - def getStudio(self, htmltree): - return '91制片厂' - - def getActors(self, htmltree): - b=[] - for player in self.getTreeAll(htmltree, self.expr_actor): - player = player.replace('主演:','') - if '/' in player: - player = player.split('/')[0] - player = re.sub(r'[0-9]+', '', player) - b.append(player) - return b - - def getRelease(self, htmltree): - try: - result = super().getRelease(htmltree) - date = result.replace('日期:','') - if isinstance(date, str) and len(date): - return date - except: - pass - return '' - - def getCover(self, htmltree): - try: - url = str(re.findall('var pic_url = "(.*?)"', self.htmlcode)[0]) - return url.strip() - except: - return '' -