Files
AV_Data_Capture/scrapinglib/pissplay.py
2023-02-03 03:40:48 +08:00

88 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
from datetime import datetime
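# Scrape videos from https://pissplay.com/
# Videos on pissplay have no catalogue number, so they are looked up by file name.
# A video can only be scraped when the file name exactly matches the video title on the site.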
class Pissplay(Parser):
    """Scraper for video pages on https://pissplay.com/.

    The site's videos carry no catalogue number, so the sanitised title is
    used as the number and the detail-page URL is derived from the name
    handed to :meth:`search`.
    """

    source = 'pissplay'

    # The site has no catalogue numbers, so the title stands in for one.
    expr_number = '//*[@id="video_title"]/text()'
    expr_title = '//*[@id="video_title"]/text()'
    expr_cover = '/html/head//meta[@property="og:image"]/@content'
    expr_tags = '//div[@id="video_tags"]/a/text()'
    expr_release = '//div[@class="video_date"]/text()'
    expr_outline = '//*[@id="video_description"]/p//text()'

    # When a guest video's first tag is one of these, the guest's name is
    # taken from the second tag instead of the first.
    # NOTE(review): the original condition listed 'Collaboration' twice;
    # a third value may have been intended -- confirm against the site.
    _seriesTags = ('Collaboration', 'Toilet for a Day')

    def extraInit(self):
        """Set scraper-specific attributes."""
        self.imagecut = 0  # do not crop the cover
        self.specifiedSource = None

    def search(self, number):
        """Fetch and parse the detail page for ``number``.

        ``self.specifiedUrl`` is used as the detail URL when it is truthy;
        otherwise the URL is built from ``number`` by dropping every
        character other than ASCII letters, digits and spaces, lower-casing,
        and replacing spaces with ``-``.

        Returns 404 when ``getHtml`` returns 404, otherwise whatever
        ``dictformat`` returns for the parsed tree.
        """
        cleaned = number.strip()
        self.number = cleaned.upper()
        if self.specifiedUrl:
            self.detailurl = self.specifiedUrl
        else:
            # Build the slug from the stripped name so that surrounding
            # whitespace does not become leading/trailing dashes.
            slug = re.sub(r"[^a-zA-Z0-9 ]", "", cleaned)  # drop special characters
            slug = slug.lower().replace(" ", "-")
            self.detailurl = "https://pissplay.com/videos/" + slug + "/"
        self.htmlcode = self.getHtml(self.detailurl)
        if self.htmlcode == 404:
            return 404
        htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
        return self.dictformat(htmltree)

    def getNum(self, htmltree):
        """Return the sanitised title, which doubles as the number."""
        return self.getTitle(htmltree)

    def getTitle(self, htmltree):
        """Return the page title reduced to ASCII letters, digits and spaces."""
        title = super().getTitle(htmltree)
        return re.sub(r"[^a-zA-Z0-9 ]", "", title)  # drop special characters

    def getCover(self, htmltree):
        """Return the cover URL, prefixing ``https:`` when it lacks a scheme.

        An empty value from the base class is returned unchanged instead of
        being turned into the meaningless URL ``'https:'``.
        """
        url = super().getCover(htmltree)
        if url and not url.startswith('http'):
            url = 'https:' + url
        return url

    def getRelease(self, htmltree):
        """Return the release date as ``YYYY-MM-DD``.

        The value from the base class is parsed as ``'%d %b %Y'``.
        ``datetime.strptime`` raises ``ValueError`` when it does not match.
        NOTE(review): ``%b`` is locale-dependent, so parsing English month
        abbreviations may fail under a non-English locale.
        """
        releaseDate = super().getRelease(htmltree)
        return datetime.strptime(releaseDate, '%d %b %Y').strftime('%Y-%m-%d')

    def getStudio(self, htmltree):
        """Return the fixed studio name."""
        return 'PissPlay'

    def _guestIndex(self, tags):
        """Return the index in ``tags`` of the guest's name, or None.

        A guest is only present when ``'Guests'`` is one of the tags. The
        name is the second tag when the first tag is in ``_seriesTags``,
        otherwise the first tag.
        """
        if 'Guests' not in tags:
            return None
        return 1 if tags[0] in self._seriesTags else 0

    def getTags(self, htmltree):
        """Return the page's tags with the guest's name (if any) removed."""
        tags = self.getTreeAll(htmltree, self.expr_tags)
        guestAt = self._guestIndex(tags)
        if guestAt is None:
            return tags
        # Build a new list rather than mutating the one from getTreeAll.
        return tags[:guestAt] + tags[guestAt + 1:]

    def getActors(self, htmltree) -> list:
        """Return the actors as a list.

        For guest videos this is the guest's name; otherwise it is the
        single entry ``'Bruce and Morgan'``. A list is returned in every
        case, matching the annotation and the guest branches.
        """
        tags = self.getTreeAll(htmltree, self.expr_tags)
        guestAt = self._guestIndex(tags)
        if guestAt is None:
            return ['Bruce and Morgan']
        return [tags[guestAt]]

    def getOutline(self, htmltree):
        """Return the description text joined into one string.

        Everything from the first text node equal to ``' Morgan xx'`` onward
        is dropped, and ``&`` is replaced with ``and``.
        """
        pieces = self.getTreeAll(htmltree, self.expr_outline)
        if ' Morgan xx' in pieces:
            pieces = pieces[:pieces.index(' Morgan xx')]
        return ''.join(pieces).replace("&", "and")