Files
AV_Data_Capture/scrapinglib/mv91.py
Mathhew 0dda035057 update scrapinglib
- 优化提取extrafanart,trailer等,直接使用xpath expr,不需要正则匹配
- 优化 getchu 获取cover方法,直接使用og标签信息
- 优化 www.getchu 识别 getchu-id 的资源
- 统一获取 tag 方法,返回值 list
2022-06-15 14:23:49 +08:00

91 lines
2.9 KiB
Python

# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Mv91(Parser):
    """Scraper for the ``mv91`` source (www.91mv.org).

    The ``expr_*`` attributes are XPath expressions for the detail page.
    They are presumably consumed by the ``Parser`` base-class getters that
    the overrides below call through ``super()`` -- confirm against
    ``Parser``.
    """

    source = 'mv91'

    expr_number = '//div[@class="player-title"]/text()'
    expr_title = '//div[@class="player-title"]/text()'
    expr_release = '//p[@class="date"]/text()'
    expr_outline = '//div[@class="play-text"]/text()'
    expr_tags = '//div[@class="player-tag"]/text()'
    expr_actor = '//p[@class="player-name"]/text()'

    # Splits the player title into (title text, number starting with "91").
    # Number and title share one page element, so both getters use this.
    # Raw string: "\d" in a normal string literal is an invalid escape.
    _TITLE_NUMBER_RE = re.compile(r'(.*)(91.*-\d*)')

    def getHtmlTree(self, url, type=None):
        """Fetch *url* and parse the response into an lxml HTML tree.

        The raw response is stored on ``self.htmlcode``; ``getCover`` later
        reads the cover URL out of it.

        Returns:
            The parsed tree, or ``404`` when ``getHtml`` returned ``404``.
        """
        # ``type`` shadows the builtin, but it is part of the method's
        # signature and therefore kept as-is.
        self.htmlcode = self.getHtml(url, type)
        if self.htmlcode == 404:
            return 404
        return etree.fromstring(self.htmlcode, etree.HTMLParser())

    def queryNumberUrl(self, number):
        """Search the site for *number* and return the first hit's detail URL.

        The ``91CM-`` / ``91MS-`` prefixes are removed before searching.

        Raises:
            IndexError: if the result page contains no ``video-list`` link.
        """
        keyword = number.replace('91CM-', '').replace('91MS-', '')
        search_html = self.getHtml(
            'https://www.91mv.org/index/search?keywords=' + keyword)
        # NOTE(review): unlike getHtmlTree, a ``404`` result from getHtml is
        # not checked here, so parsing would raise -- confirm callers expect
        # an exception in that case.
        html = etree.fromstring(search_html, etree.HTMLParser())
        endurl = html.xpath('//a[@class="video-list"]/@href')[0]
        return 'https://www.91mv.org' + endurl

    def getNum(self, htmltree):
        """Return the number found on the page if it matches ``self.number``.

        The comparison is case-insensitive. Returns ``''`` when the page's
        number differs from the requested one or when extraction fails.
        """
        try:
            num = super().getNum(htmltree)
            match = self._TITLE_NUMBER_RE.search(num)
            if match:
                result = match.group(2)
            else:
                # Fallback: treat "/" as whitespace and take the second token.
                result = num.replace('/', ' ').split()[1]
            if self.number.upper() != result.upper():
                # Page belongs to a different number: report "not found".
                return ''
            return result.strip()
        except Exception:
            # Best-effort getter: any extraction problem yields an empty value.
            return ''

    def getTitle(self, htmltree):
        """Return the title text preceding the number, or ``''`` on failure."""
        try:
            title = super().getTitle(htmltree)
            match = self._TITLE_NUMBER_RE.search(title)
            if match:
                result = match.group(1)
            else:
                # Fallback: first token, with the trailer marker removed.
                result = title.replace('/', ' ').split()[0].replace('「预告」', '')
            return result.strip()
        except Exception:
            return ''

    def getStudio(self, htmltree):
        """Return the fixed studio name for this source."""
        return '91制片厂'

    def getActors(self, htmltree):
        """Return the list of actor names taken from ``expr_actor`` nodes.

        For each node the ``主演:`` label is removed, only the part before
        the first ``/`` is kept, and all digits are stripped.
        """
        actors = []
        for player in self.getTreeAll(htmltree, self.expr_actor):
            # split('/')[0] is the whole string when no "/" is present.
            name = player.replace('主演:', '').split('/')[0]
            actors.append(re.sub(r'[0-9]+', '', name))
        return actors

    def getRelease(self, htmltree):
        """Return the release date without its ``日期:`` label, or ``''``."""
        try:
            date = super().getRelease(htmltree).replace('日期:', '')
            if isinstance(date, str) and date:
                return date
        except Exception:
            pass
        return ''

    def getCover(self, htmltree):
        """Return the cover URL from the page's ``var pic_url`` script variable.

        Reads ``self.htmlcode`` (set by ``getHtmlTree``) rather than
        *htmltree*. Returns ``''`` if the variable is absent or the page
        source is unavailable.
        """
        try:
            match = re.search(r'var pic_url = "(.*?)"', self.htmlcode)
            return match.group(1).strip()
        except Exception:
            # Covers a missing match (None), a missing attribute, and a
            # non-string ``htmlcode`` such as 404.
            return ''