add scrapinglib
This commit is contained in:
78
scrapinglib/avsox.py
Normal file
78
scrapinglib/avsox.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
from .parser import Parser
|
||||
|
||||
|
||||
class Avsox(Parser):
    """Scraper for the ``avsox`` source, built on the shared ``Parser`` base.

    The class supplies the XPath expressions used to pull each metadata
    field out of a detail page, and overrides the getters whose raw values
    need post-processing.  Fetching and XPath evaluation are delegated to
    ``Parser`` (``getHtmlTree`` / ``getTreeIndex``), which is not visible in
    this file.
    """

    source = 'avsox'
    # NOTE(review): the meaning of 3 is defined by whatever consumes
    # ``imagecut`` (not visible here) -- confirm before changing.
    imagecut = 3

    # XPath expressions evaluated against the detail page.  The Chinese
    # literals are page labels: "识别码" = ID code, "制作商" = studio,
    # "发行时间" = release date, "系列" = series.
    expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
    expr_actor = '//a[@class="avatar-box"]'
    expr_actorphoto = '//a[@class="avatar-box"]'
    expr_title = '/html/body/div[2]/h3/text()'
    expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()'
    expr_release = '//span[contains(text(),"发行时间:")]/../text()'
    expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src'
    # Evaluated against the search-result page, not the detail page
    # (see getSmallCover).
    expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src'
    expr_tags = '/html/head/meta[@name="keywords"]/@content'
    expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
    expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'

    def queryNumberUrl(self, number):
        """Search the site for *number* and return the detail-page URL.

        The current site address is read from the page at
        ``https://tellme.pw/avsox``.  Up to three search keywords are then
        tried in order, stopping at the first one that yields a result link:
        the number as given, the number with ``-`` replaced by ``_``, and
        the number with ``_`` removed.

        Side effect: ``self.searchtree`` is left holding the tree of the
        last search page fetched; ``getSmallCover`` reads it later.

        Returns:
            ``"https:"`` followed by the first result link found.
        """
        mirror_tree = self.getHtmlTree('https://tellme.pw/avsox')
        site = self.getTreeIndex(mirror_tree, '//div[@class="container"]/div/a/@href')

        # NOTE(review): the third keyword is derived from the original
        # number, so for a number without "_" it repeats the first search.
        # Possibly "separator removed" was intended -- confirm before changing.
        keywords = (
            number,
            number.replace('-', '_'),
            number.replace('_', ''),
        )
        result1 = ''
        for keyword in keywords:
            self.searchtree = self.getHtmlTree(site + '/cn/search/' + keyword)
            result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
            if result1 not in ('', 'null', 'None'):
                break

        # NOTE(review): when every search comes back empty this returns the
        # bare string "https:"; callers presumably fail on the next fetch.
        return "https:" + result1

    def getNum(self, htmltree):
        """Return the number shown on the page after validating it.

        The value extracted with ``expr_number`` must equal ``self.number``
        ignoring case; on success ``self.number`` is replaced by the page's
        spelling.

        Raises:
            ValueError: the page's number does not match ``self.number``.
        """
        new_number = self.getTreeIndex(htmltree, self.expr_number)
        if new_number.upper() != self.number.upper():
            raise ValueError('number not found in ' + self.source)
        self.number = new_number
        return new_number

    def getTitle(self, htmltree):
        """Return the title with ``/`` removed and the number trimmed off.

        ``self.number`` is removed only when it appears as an exact
        (case-insensitive) prefix or suffix of the title.  The previous
        ``str.strip(self.number)`` treated the number as a *set of
        characters* and could therefore eat unrelated leading or trailing
        title characters.  Surrounding whitespace is stripped afterwards.
        """
        title = super().getTitle(htmltree).replace('/', '')
        number = self.number
        if number:
            size = len(number)
            wanted = number.upper()
            if title[:size].upper() == wanted:
                title = title[size:]
            if title[-size:].upper() == wanted:
                title = title[:-size]
        return title.strip()

    def getStudio(self, htmltree):
        """Return the studio with every ``', '`` sequence replaced by a space.

        Presumably this flattens a stringified list produced by the base
        class -- verify against ``Parser.getStudio``.
        """
        return super().getStudio(htmltree).replace("', '", ' ')

    def getSmallCover(self, htmltree):
        """Use the small preview image from the search-result page.

        *htmltree* is ignored; the lookup runs on ``self.searchtree``, which
        ``queryNumberUrl`` stores.
        """
        return self.getTreeIndex(self.searchtree, self.expr_smallcover)

    def getTags(self, htmltree):
        """Return the tags as a list of stripped strings.

        The base-class value is split on ``,`` and the first two entries are
        discarded; fewer than three entries yields an empty list.
        """
        tags = super().getTags(htmltree).split(',')
        # Slicing past the end already produces [], so no length check is needed.
        return [tag.strip() for tag in tags[2:]]

    def getOutline(self, htmltree):
        """Return the outline looked up by ``getStoryline`` for ``self.number``.

        *htmltree* is not used.
        """
        # Imported at call time, as in the original code.
        from .storyline import getStoryline
        return getStoryline(self.number)

    def getActors(self, htmltree):
        """Return the actor names: the ``span`` text of each actor element."""
        return [node.find('span').text for node in super().getActors(htmltree)]

    def getActorPhoto(self, htmltree):
        """Return a dict mapping each actor name to that actor's image ``src``.

        If a name occurs more than once, the last occurrence wins.
        """
        photos = {}
        for node in super().getActorPhoto(htmltree):
            photo_url = node.find('.//img').attrib['src']
            actor_name = node.find('span').text
            photos[actor_name] = photo_url
        return photos
|
||||
Reference in New Issue
Block a user