add scrapinglib

This commit is contained in:
Mathhew
2022-05-26 14:03:58 +08:00
parent 529aeaddd2
commit b7ecb66210
20 changed files with 2515 additions and 0 deletions

145
scrapinglib/javbus.py Normal file
View File

@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
import re
import os
import secrets
import inspect
from lxml import etree
from urllib.parse import urljoin
from .parser import Parser
class Javbus(Parser):
source = 'javbus'
expr_number = '/html/head/meta[@name="keywords"]/@content'
expr_title = '/html/head/title/text()'
expr_studio = '//span[contains(text(),"製作商:")]/../a/text()'
expr_studio2 = '//span[contains(text(),"メーカー:")]/../a/text()'
expr_director = '//span[contains(text(),"導演:")]/../a/text()'
expr_directorJa = '//span[contains(text(),"監督:")]/../a/text()'
expr_series = '//span[contains(text(),"系列:")]/../a/text()'
expr_series2 = '//span[contains(text(),"シリーズ:")]/../a/text()'
expr_label = '//span[contains(text(),"系列:")]/../a/text()'
expr_cover = '//a[@class="bigImage"]/@href'
expr_release = '/html/body/div[5]/div[1]/div[2]/p[2]/text()'
expr_runtime = '/html/body/div[5]/div[1]/div[2]/p[3]/text()'
expr_actor = '//div[@class="star-name"]/a'
expr_actorphoto = '//div[@class="star-name"]/../a/img'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
def search(self, number, core: None):
self.number = number
self.updateCore(core)
try:
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
'dmmbus.fun', 'dmmsee.fun',
'fanbus.us',
'seedmm.fun',
]) + "/"
try:
self.detailurl = url + number
self.htmlcode = self.getHtml(self.detailurl)
except:
self.detailurl = 'https://www.javbus.com/' + number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode,etree.HTMLParser())
result = self.dictformat(htmltree)
return result
except:
self.searchUncensored(number)
def searchUncensored(self, number):
""" 二次搜索无码
"""
self.imagecut = 0
self.uncensored = True
w_number = number.replace('.', '-')
self.detailurl = 'https://www.javbus.red/' + w_number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return super().getNum(htmltree).split(',')[0]
def getTitle(self, htmltree):
title = super().getTitle(htmltree)
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudio(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_studio2)
else:
return self.getTreeIndex(htmltree, self.expr_studio)
def getCover(self, htmltree):
return urljoin("https://www.javbus.com", super().getCover(htmltree))
def getRelease(self, htmltree):
return super().getRelease(htmltree).strip(" ['']")
def getRuntime(self, htmltree):
return super().getRuntime(htmltree).strip(" ['']分鐘")
def getActors(self, htmltree):
actors = super().getActors(htmltree)
b=[]
for i in actors:
b.append(i.attrib['title'])
return b
def getActorPhoto(self, htmltree):
actors = super().getActorPhoto(htmltree)
d = {}
for i in actors:
p = i.attrib['src']
if "nowprinting.gif" in p:
continue
t = i.attrib['title']
d[t] = urljoin("https://www.javbus.com", p)
return d
def getDirector(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_directorJa)
else:
return self.getTreeIndex(htmltree, self.expr_director)
def getSeries(self, htmltree):
if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_series2)
else:
return self.getTreeIndex(htmltree, self.expr_series)
def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',')
return tags[1:]
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getOutline(self, htmltree):
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # 从airav.py过来的调用不计算outline直接返回避免重复抓取数据拖慢处理速度
from .storyline import getStoryline
return getStoryline(self.number , uncensored = self.uncensored)