弃用91mv,换用新网站javday

This commit is contained in:
hejianjun
2022-11-06 20:27:55 +08:00
parent c1370e96d8
commit b906be8099
3 changed files with 47 additions and 98 deletions

View File

@@ -11,7 +11,6 @@ from .gcolle import Gcolle
from .getchu import Getchu
from .jav321 import Jav321
from .javdb import Javdb
from .mv91 import Mv91
from .fc2 import Fc2
from .madou import Madou
from .mgstage import Mgstage
@@ -19,6 +18,7 @@ from .javbus import Javbus
from .xcity import Xcity
from .avsox import Avsox
from .javlibrary import Javlibrary
from .javday import Javday
from .tmdb import Tmdb
from .imdb import Imdb
@@ -50,8 +50,8 @@ class Scraping:
"""
"""
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
'getchu', 'gcolle'
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
'getchu', 'gcolle','javday'
]
adult_func_mapping = {
'avsox': Avsox().scrape,
@@ -65,11 +65,11 @@ class Scraping:
'fanza': Fanza().scrape,
'airav': Airav().scrape,
'carib': Carib().scrape,
'mv91': Mv91().scrape,
'gcolle': Gcolle().scrape,
'javdb': Javdb().scrape,
'getchu': Getchu().scrape,
'javlibrary': Javlibrary().scrape,
'javday': Javday().scrape
}
general_full_sources = ['tmdb', 'imdb']

43
scrapinglib/javday.py Normal file
View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urlparse, unquote
from .parser import Parser
class Javday(Parser):
    """Parser subclass for the 'javday' source.

    Detail pages are fetched from ``https://javday.tv/videos/<NUMBER>/``
    (or from a caller-specified URL) and the metadata fields are read with
    the XPath expressions declared below.
    """

    source = 'javday'

    # XPath expressions evaluated against the detail page.
    expr_url = '/html/head/meta[@property="og:url"]/@content'
    expr_cover = '/html/head/meta[@property="og:image"]/@content'
    expr_tags = '/html/head/meta[@name="keywords"]/@content'
    expr_title = "/html/head/title/text()"
    expr_actor = "//span[@class='vod_actor']/a/text()"
    expr_studio = '//span[@class="producer"]/a/text()'
    expr_number = '//span[@class="jpnum"]/text()'

    def extraInit(self):
        """Set the source-specific defaults for this parser."""
        self.uncensored = True
        # NOTE(review): the meaning of imagecut value 4 is defined outside
        # this file - confirm against the consumer of this attribute.
        self.imagecut = 4

    def search(self, number):
        """Fetch and parse the detail page for ``number``.

        The number is stripped and upper-cased before use. When no explicit
        URL was specified, the page URL is built from the number with its
        dashes removed.

        Returns 404 when ``getHtml`` yields 404; otherwise returns whatever
        ``dictformat`` produces for the parsed tree.
        """
        self.number = number.strip().upper()
        if not self.specifiedUrl:
            compact_number = self.number.replace("-", "")
            self.detailurl = "https://javday.tv/videos/" + compact_number + "/"
        else:
            self.detailurl = self.specifiedUrl

        self.htmlcode = self.getHtml(self.detailurl)
        if self.htmlcode == 404:
            return 404

        tree = etree.fromstring(self.htmlcode, etree.HTMLParser())
        # Replace the requested URL with the one the page itself advertises
        # in its og:url meta tag.
        self.detailurl = self.getTreeElement(tree, self.expr_url)
        return self.dictformat(tree)

    def getTitle(self, htmltree):
        """Return the page title without the number and the site suffix."""
        raw_title = super().getTitle(htmltree)
        # Drop the movie number and the site name from the <title> text.
        without_number = raw_title.replace(self.number, "")
        return without_number.replace("- JAVDAY.TV", "").strip()

View File

@@ -1,94 +0,0 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Mv91(Parser):
    """Parser subclass for the 'mv91' source (www.91mv.org).

    Most getters are best-effort: any ordinary error while extracting a
    field yields an empty string instead of propagating.
    """

    source = 'mv91'

    # XPath expressions evaluated against the detail page.
    expr_number = '//div[@class="player-title"]/text()'
    expr_title = '//div[@class="player-title"]/text()'
    expr_release = '//p[@class="date"]/text()'
    expr_outline = '//div[@class="play-text"]/text()'
    expr_tags = '//div[@class="player-tag"]/text()'
    expr_actor = '//p[@class="player-name"]/text()'

    # Splits the player title into (title text, "91...-<digits>" number).
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    _NUMBER_PATTERN = re.compile(r'(.*)(91.*-\d*)')

    def extraInit(self):
        """Set the source-specific defaults for this parser."""
        # NOTE(review): the meaning of imagecut value 0 is defined outside
        # this file - confirm against the consumer of this attribute.
        self.imagecut = 0
        self.uncensored = True

    def getHtmlTree(self, url, type=None):
        """Fetch ``url`` and return it parsed as an lxml HTML tree.

        The raw response is kept in ``self.htmlcode`` (``getCover`` reads it
        later). Returns 404 when ``getHtml`` yields 404.
        """
        self.htmlcode = self.getHtml(url, type)
        if self.htmlcode == 404:
            return 404
        return etree.fromstring(self.htmlcode, etree.HTMLParser())

    def queryNumberUrl(self, number):
        """Search the site for ``number`` and return the first result's URL.

        The '91CM-' / '91MS-' prefixes are removed before searching.
        Raises IndexError when the search page contains no result link.
        """
        keyword = number.replace('91CM-', '').replace('91MS-', '')
        search_html = self.getHtml('https://www.91mv.org/index/search?keywords=' + keyword)
        html = etree.fromstring(search_html, etree.HTMLParser())
        endurl = html.xpath('//a[@class="video-list"]/@href')[0]
        return 'https://www.91mv.org' + endurl

    def getNum(self, htmltree):
        """Return the number found on the page, or '' if it cannot be used.

        An empty string is returned when extraction fails or when the number
        on the page does not match ``self.number`` (case-insensitive).
        """
        try:
            num = super().getNum(htmltree)
            finds = self._NUMBER_PATTERN.findall(num)
            if finds:
                result = str(finds[0][1])
            else:
                # Fallback: treat '/' as whitespace and take the second token.
                result = ' '.join(num.replace('/', ' ').split())
                result = result.split()[1]
            if self.number.upper() != result.upper():
                # Mismatch with the requested number: report "not found".
                # (Previously an exception was raised here only to be
                # swallowed by the surrounding handler; the result is the
                # same.)
                return ''
            return result.strip()
        except Exception:
            # Best-effort field: keep swallowing ordinary errors, but no
            # longer intercept KeyboardInterrupt / SystemExit.
            return ''

    def getTitle(self, htmltree):
        """Return the title part of the player title, or '' on failure."""
        try:
            title = super().getTitle(htmltree)
            finds = self._NUMBER_PATTERN.findall(title)
            if finds:
                result = str(finds[0][0])
            else:
                # Fallback: treat '/' as whitespace and take the first token.
                result = ' '.join(title.replace('/', ' ').split())
                result = result.split()[0]
            # Drop the trailer marker and any leading/trailing '/' or spaces.
            return result.replace('「预告」', '').strip('/ ')
        except Exception:
            return ''

    def getStudio(self, htmltree):
        """Return the fixed studio name used for every entry of this source."""
        return '91制片厂'

    def getActors(self, htmltree):
        """Return the list of actor names extracted from the page.

        For each matched text node the '主演:' label is removed, only the
        part before the first '/' is kept, and all digits are stripped.
        """
        actors = []
        for player in self.getTreeAll(htmltree, self.expr_actor):
            player = player.replace('主演:', '')
            if '/' in player:
                player = player.split('/')[0]
            player = re.sub(r'[0-9]+', '', player)
            actors.append(player)
        return actors

    def getRelease(self, htmltree):
        """Return the release date text without its '日期:' label, or ''."""
        try:
            result = super().getRelease(htmltree)
            date = result.replace('日期:', '')
            if isinstance(date, str) and len(date):
                return date
        except Exception:
            pass
        return ''

    def getCover(self, htmltree):
        """Return the cover URL taken from the page's ``pic_url`` script variable.

        Reads the raw HTML stored in ``self.htmlcode`` rather than
        ``htmltree``. Returns '' when the variable is not found.
        """
        try:
            url = str(re.findall('var pic_url = "(.*?)"', self.htmlcode)[0])
            return url.strip()
        except Exception:
            return ''