Files
AV_Data_Capture/scrapinglib/avsox.py
2022-06-13 10:00:41 +08:00

81 lines
3.1 KiB
Python

# -*- coding: utf-8 -*-
import re
from .parser import Parser
class Avsox(Parser):
    """Scraper for the 'avsox' source, built on the shared ``Parser`` base.

    The ``expr_*`` class attributes are XPath expressions. This class uses
    ``expr_number`` (in ``getNum``) and ``expr_smallcover`` (in
    ``getSmallCover``) directly; the others are presumably consumed by the
    ``Parser`` base-class getters -- confirm against ``Parser``.
    """

    source = 'avsox'
    # NOTE(review): the meaning of this value is defined by Parser / its
    # callers, not by this file.
    imagecut = 3

    expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
    expr_actor = '//a[@class="avatar-box"]'
    expr_actorphoto = '//a[@class="avatar-box"]'
    expr_title = '/html/body/div[2]/h3/text()'
    expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()'
    expr_release = '//span[contains(text(),"发行时间:")]/../text()'
    expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src'
    expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src'
    expr_tags = '/html/head/meta[@name="keywords"]/@content'
    expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
    expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'

    def queryNumberUrl(self, number):
        """Search the site for ``number`` and return the detail-page URL.

        The current site address is read from https://tellme.pw/avsox. The
        search is tried with the number as given, then with '-' replaced by
        '_', then with '_' removed, stopping at the first result that is not
        '', 'null' or 'None'. Keyword variants identical to one already
        tried are skipped, so the same URL is never requested twice.

        Side effect: ``self.searchtree`` holds the last search page fetched;
        ``getSmallCover`` reads from it.

        Returns:
            ``"https:"`` concatenated with the value found for the result
            link. If every search comes back empty this is just "https:".
        """
        query_site_tree = self.getHtmlTree('https://tellme.pw/avsox')
        site = self.getTreeElement(query_site_tree, '//div[@class="container"]/div/a/@href')
        expr_result = '//*[@id="waterfall"]/div/a/@href'

        # NOTE(review): the last variant removes '_' from the *original*
        # number, so for a number containing '-' it equals the first variant.
        # Confirm whether removing '-' was the real intent.
        keywords = (number, number.replace('-', '_'), number.replace('_', ''))

        tried = set()
        result1 = ''
        for keyword in keywords:
            if keyword in tried:
                continue
            tried.add(keyword)
            self.searchtree = self.getHtmlTree(site + '/cn/search/' + keyword)
            result1 = self.getTreeElement(self.searchtree, expr_result)
            if result1 not in ('', 'null', 'None'):
                break
        return "https:" + result1

    def getNum(self, htmltree):
        """Read the number from the page and check it matches the request.

        The comparison with ``self.number`` is case-insensitive. On success
        ``self.number`` is overwritten with the value found on the page.

        Raises:
            ValueError: if the number on the page differs from ``self.number``.
        """
        new_number = self.getTreeElement(htmltree, self.expr_number)
        if new_number.upper() != self.number.upper():
            raise ValueError('number not found in ' + self.source)
        self.number = new_number
        return new_number

    def getTitle(self, htmltree):
        """Return the title with '/' removed and the number trimmed off.

        The number is removed only when it appears as an exact prefix or
        suffix of the title. ``str.strip(self.number)`` is deliberately not
        used: it treats its argument as a *set of characters* and would also
        eat legitimate title characters that happen to occur in the number.
        Surrounding whitespace is trimmed from the result.
        """
        title = super().getTitle(htmltree).replace('/', '')
        number = self.number
        if number:
            if title.startswith(number):
                title = title[len(number):]
            if title.endswith(number):
                title = title[:-len(number)]
        return title.strip()

    def getStudio(self, htmltree):
        """Return the base-class studio value with each "', '" replaced by a space."""
        return super().getStudio(htmltree).replace("', '", ' ')

    def getSmallCover(self, htmltree):
        """Use the preview thumbnail from the search results page.

        Reads from ``self.searchtree`` (set by ``queryNumberUrl``) rather
        than from ``htmltree``.
        """
        return self.getTreeElement(self.searchtree, self.expr_smallcover)

    def getTags(self, htmltree):
        """Return the comma-separated tags, skipping the first two entries.

        Each remaining entry is whitespace-trimmed. Returns an empty list
        when there are two entries or fewer.
        """
        tags = super().getTags(htmltree).split(',')
        # Slicing past the end already yields [], so no length check is needed.
        return [tag.strip() for tag in tags[2:]]

    def getOutline(self, htmltree):
        """Return ``getStoryline(self.number)`` if ``self.morestoryline`` is truthy, else ''."""
        if self.morestoryline:
            # Imported lazily, as in the original, so the storyline module is
            # only loaded when it is actually needed.
            from .storyline import getStoryline
            return getStoryline(self.number)
        return ''

    def getActors(self, htmltree):
        """Return the text of the ``span`` child of each actor element."""
        return [box.find('span').text for box in super().getActors(htmltree)]

    def getActorPhoto(self, htmltree):
        """Return a dict mapping actor name to photo URL.

        For each actor element the name is the text of its ``span`` child and
        the URL is the ``src`` attribute of the first ``img`` below it. A
        later element with the same name overwrites an earlier one.
        """
        photos = {}
        for box in super().getActorPhoto(htmltree):
            photo_url = box.find('.//img').attrib['src']
            name = box.find('span').text
            photos[name] = photo_url
        return photos