Files
AV_Data_Capture/scrapinglib/javday.py
hejianjun 1d46a70eed 1. 动态加载爬虫
2. 修复pyinstaller路径查找子包问题
3. madou的番号处理移动到爬虫内部
4. 过滤javday中多余的tag
2023-03-27 15:37:00 +08:00

46 lines
1.5 KiB
Python

# -*- coding: utf-8 -*-
from lxml import etree
from .parser import Parser
class Javday(Parser):
    """Scraper for javday.tv detail pages.

    The class-level ``expr_*`` attributes are XPath expressions evaluated
    against the parsed detail page; the methods below override the pieces
    of the base ``Parser`` behaviour that need site-specific handling.
    """

    source = 'javday'

    # XPath expressions for the individual metadata fields.
    expr_url = '/html/head/meta[@property="og:url"]/@content'
    expr_cover = '/html/head/meta[@property="og:image"]/@content'
    expr_tags = '/html/head/meta[@name="keywords"]/@content'
    expr_title = "/html/head/title/text()"
    expr_actor = "//span[@class='vod_actor']/a/text()"
    expr_studio = '//span[@class="producer"]/a/text()'
    expr_number = '//span[@class="jpnum"]/text()'

    def extraInit(self):
        """Set the site-specific defaults for this scraper."""
        # NOTE(review): the meaning of the value 4 is defined by whatever
        # consumes ``imagecut``; it is not visible in this file.
        self.imagecut = 4
        self.uncensored = True

    def search(self, number):
        """Fetch and parse the detail page for ``number``.

        The number is stripped and upper-cased and stored on the instance.
        When ``self.specifiedUrl`` is set it is used as the page URL;
        otherwise the URL is built from the number with its dashes removed.

        Returns ``404`` when ``getHtml`` reports ``404``; otherwise the
        value produced by ``dictformat`` for the parsed page.
        """
        self.number = number.strip().upper()

        # An explicitly specified URL takes precedence over the derived one.
        slug = self.number.replace("-", "")
        self.detailurl = self.specifiedUrl or (
            "https://javday.tv/videos/" + slug + "/"
        )

        self.htmlcode = self.getHtml(self.detailurl)
        if self.htmlcode == 404:
            return 404

        page = etree.fromstring(self.htmlcode, etree.HTMLParser())
        # Replace the requested URL with the one the page itself declares
        # in its og:url meta tag.
        self.detailurl = self.getTreeElement(page, self.expr_url)
        return self.dictformat(page)

    def getTitle(self, htmltree):
        """Return the page title without the number and the site name."""
        cleaned = super().getTitle(htmltree)
        # Remove the number and the site name.
        for fragment in (self.number, "- JAVDAY.TV"):
            cleaned = cleaned.replace(fragment, "")
        return cleaned.strip()

    def getTags(self, htmltree) -> list:
        """Return the base-class tags minus any that mention the site name."""
        kept = []
        for tag in super().getTags(htmltree):
            if 'JAVDAY.TV' in tag:
                continue
            kept.append(tag)
        return kept