From bb37d6ad090c6534f2efc30978f1d1f41532b65d Mon Sep 17 00:00:00 2001
From: Marks
Date: Sun, 4 Dec 2022 21:01:19 -0800
Subject: [PATCH] add pissplay

---
 scraper.py              | 15 +++++--
 scrapinglib/api.py      |  6 ++-
 scrapinglib/pissplay.py | 99 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 6 deletions(-)
 create mode 100644 scrapinglib/pissplay.py

diff --git a/scraper.py b/scraper.py
index 91b4219..2545336 100644
--- a/scraper.py
+++ b/scraper.py
@@ -100,8 +100,11 @@ def get_data_from_json(
     # ================================================网站规则添加结束================================================
 
     title = json_data.get('title')
-    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # 字符串转列表
-    actor_list = [actor.strip() for actor in actor_list] # 去除空白
+    if json_data.get('source') == 'pissplay':  # pissplay actors are English names, so their spaces must be kept
+        actor_list = [json_data.get('actor')]
+    else:
+        actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # string to list
+        actor_list = [actor.strip() for actor in actor_list]  # trim whitespace
     director = json_data.get('director')
     release = json_data.get('release')
     number = json_data.get('number')
@@ -134,11 +137,15 @@ def get_data_from_json(
         tag.remove('XXXX')
     while 'xxx' in tag:
         tag.remove('xxx')
-    actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
+    if json_data.get('source') == 'pissplay':  # pissplay actors are English names, so their spaces must be kept
+        actor = str(actor_list).strip("[ ]").replace("'", '')
+    else:
+        actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
 
-    if title == '' or number == '':
+    # pissplay has no movie number, so only a missing title is fatal for that source
+    if title == '' or (number == '' and json_data.get('source') != 'pissplay'):
         print('[-]Movie Number or Title not found!')
         return None
 
     # if imagecut == '3':
     #     DownloadFileWithFilename()
diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index 0a9c7fe..c2bb7f6 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -20,6 +20,7 @@ from .xcity import Xcity
 from .avsox import Avsox
 from .javlibrary import Javlibrary
 from .javday import Javday
+from .pissplay import Pissplay
 
 from .tmdb import Tmdb
 from .imdb import Imdb
@@ -52,7 +53,7 @@ class Scraping:
     """
     adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
                           'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou',
-                          'getchu', 'gcolle','javday'
+                          'getchu', 'gcolle','javday','pissplay'
                           ]
     adult_func_mapping = {
         'avsox': Avsox().scrape,
@@ -70,7 +71,8 @@ class Scraping:
         'javdb': Javdb().scrape,
         'getchu': Getchu().scrape,
         'javlibrary': Javlibrary().scrape,
-        'javday': Javday().scrape
+        'javday': Javday().scrape,
+        'pissplay': Pissplay().scrape
     }
 
     general_full_sources = ['tmdb', 'imdb']
diff --git a/scrapinglib/pissplay.py b/scrapinglib/pissplay.py
new file mode 100644
index 0000000..9298250
--- /dev/null
+++ b/scrapinglib/pissplay.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+import re
+from lxml import etree
+from .parser import Parser
+from datetime import datetime
+
+# Scrapes videos from https://pissplay.com/
+# Videos on pissplay have no movie number, so they are looked up by file name.
+# A video can only be scraped when the file name exactly matches the video title on the site.
+class Pissplay(Parser):
+    source = 'pissplay'
+
+    expr_number = '//*[@id="video_title"]/text()'  # no movie number on this site, so the title is used instead
+    expr_title = '//*[@id="video_title"]/text()'
+    expr_cover = '/html/head//meta[@property="og:image"]/@content'
+    expr_tags = '//div[@id="video_tags"]/a/text()'
+    expr_release = '//div[@class="video_date"]/text()'
+    expr_outline = '//*[@id="video_description"]/p//text()'
+
+    # Tags that may precede the actor tag on 'Guests' videos (used by getTags and getActors)
+    category_tags = ('Collaboration', 'Toilet for a Day')
+
+    def extraInit(self):
+        self.imagecut = 0  # do not crop the cover
+        self.specifiedSource = None
+
+    def search(self, number):
+        """Load the detail page for `number`; return 404 when getHtml reports 404, else the dictformat result."""
+        self.number = number.strip().upper()
+        if self.specifiedUrl:
+            self.detailurl = self.specifiedUrl
+        else:
+            newName = re.sub(r"[^a-zA-Z0-9 ]", "", number)  # drop special characters
+            # split() trims the ends and collapses space runs, so the slug gets no stray or doubled hyphens
+            self.detailurl = "https://pissplay.com/videos/" + "-".join(newName.lower().split()) + "/"
+        self.htmlcode = self.getHtml(self.detailurl)
+        if self.htmlcode == 404:
+            return 404
+        htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
+        result = self.dictformat(htmltree)
+        return result
+
+    def getNum(self, htmltree):
+        # the site has no movie number; the cleaned title stands in for it
+        title = self.getTitle(htmltree)
+        return title
+
+    def getTitle(self, htmltree):
+        title = super().getTitle(htmltree)
+        title = re.sub(r"[^a-zA-Z0-9 ]", "", title)  # drop special characters
+        return title
+
+    def getCover(self, htmltree):
+        url = super().getCover(htmltree)
+        # only prefix a scheme onto a real value, so a missing cover does not turn into 'https:'
+        if url and not url.startswith('http'):
+            url = 'https:' + url
+        return url
+
+    def getRelease(self, htmltree):
+        releaseDate = super().getRelease(htmltree).strip()
+        try:
+            return datetime.strptime(releaseDate, '%d %b %Y').strftime('%Y-%m-%d')
+        except ValueError:
+            # missing or unexpected date text: hand back the raw text instead of raising
+            return releaseDate
+
+    def getStudio(self, htmltree):
+        return 'PissPlay'
+
+    def getTags(self, htmltree):
+        """Return the tag list; on 'Guests' videos the tag that getActors uses as the actor is dropped."""
+        tags = self.getTreeAll(htmltree, self.expr_tags)
+        if 'Guests' in tags:
+            if tags[0] in self.category_tags:
+                del tags[1]
+            else:
+                tags = tags[1:]
+        return tags
+
+    def getActors(self, htmltree) -> str:
+        """Return the actor name: a tag on 'Guests' videos, otherwise 'Bruce and Morgan'."""
+        tags = self.getTreeAll(htmltree, self.expr_tags)
+        if 'Guests' not in tags:
+            return 'Bruce and Morgan'
+        # the actor is the first tag unless one of category_tags comes first
+        if tags[0] in self.category_tags:
+            return tags[1]
+        return tags[0]
+
+    def getOutline(self, htmltree):
+        outline = self.getTreeAll(htmltree, self.expr_outline)
+        if '– Morgan xx' in outline:
+            # cut the description off at the sign-off line
+            num = outline.index('– Morgan xx')
+            outline = outline[:num]
+        rstring = ''.join(outline).replace("&","and")
+        return rstring