add scrapinglib
This commit is contained in:
145
scrapinglib/javbus.py
Normal file
145
scrapinglib/javbus.py
Normal file
@@ -0,0 +1,145 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import os
|
||||
import secrets
|
||||
import inspect
|
||||
from lxml import etree
|
||||
from urllib.parse import urljoin
|
||||
from .parser import Parser
|
||||
|
||||
class Javbus(Parser):
|
||||
|
||||
source = 'javbus'
|
||||
|
||||
expr_number = '/html/head/meta[@name="keywords"]/@content'
|
||||
expr_title = '/html/head/title/text()'
|
||||
expr_studio = '//span[contains(text(),"製作商:")]/../a/text()'
|
||||
expr_studio2 = '//span[contains(text(),"メーカー:")]/../a/text()'
|
||||
expr_director = '//span[contains(text(),"導演:")]/../a/text()'
|
||||
expr_directorJa = '//span[contains(text(),"監督:")]/../a/text()'
|
||||
expr_series = '//span[contains(text(),"系列:")]/../a/text()'
|
||||
expr_series2 = '//span[contains(text(),"シリーズ:")]/../a/text()'
|
||||
expr_label = '//span[contains(text(),"系列:")]/../a/text()'
|
||||
expr_cover = '//a[@class="bigImage"]/@href'
|
||||
expr_release = '/html/body/div[5]/div[1]/div[2]/p[2]/text()'
|
||||
expr_runtime = '/html/body/div[5]/div[1]/div[2]/p[3]/text()'
|
||||
expr_actor = '//div[@class="star-name"]/a'
|
||||
expr_actorphoto = '//div[@class="star-name"]/../a/img'
|
||||
expr_tags = '/html/head/meta[@name="keywords"]/@content'
|
||||
expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
|
||||
|
||||
def search(self, number, core: None):
|
||||
|
||||
self.number = number
|
||||
self.updateCore(core)
|
||||
try:
|
||||
url = "https://www." + secrets.choice([
|
||||
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
|
||||
'cdnbus.fun',
|
||||
'dmmbus.fun', 'dmmsee.fun',
|
||||
'fanbus.us',
|
||||
'seedmm.fun',
|
||||
]) + "/"
|
||||
try:
|
||||
self.detailurl = url + number
|
||||
self.htmlcode = self.getHtml(self.detailurl)
|
||||
except:
|
||||
self.detailurl = 'https://www.javbus.com/' + number
|
||||
self.htmlcode = self.getHtml(self.detailurl)
|
||||
if self.htmlcode == 404:
|
||||
return 404
|
||||
htmltree = etree.fromstring(self.htmlcode,etree.HTMLParser())
|
||||
result = self.dictformat(htmltree)
|
||||
return result
|
||||
except:
|
||||
self.searchUncensored(number)
|
||||
|
||||
def searchUncensored(self, number):
|
||||
""" 二次搜索无码
|
||||
"""
|
||||
self.imagecut = 0
|
||||
self.uncensored = True
|
||||
|
||||
w_number = number.replace('.', '-')
|
||||
self.detailurl = 'https://www.javbus.red/' + w_number
|
||||
self.htmlcode = self.getHtml(self.detailurl)
|
||||
if self.htmlcode == 404:
|
||||
return 404
|
||||
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||
result = self.dictformat(htmltree)
|
||||
return result
|
||||
|
||||
def getNum(self, htmltree):
|
||||
return super().getNum(htmltree).split(',')[0]
|
||||
|
||||
def getTitle(self, htmltree):
|
||||
title = super().getTitle(htmltree)
|
||||
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
|
||||
return title
|
||||
|
||||
def getStudio(self, htmltree):
|
||||
if self.uncensored:
|
||||
return self.getTreeIndex(htmltree, self.expr_studio2)
|
||||
else:
|
||||
return self.getTreeIndex(htmltree, self.expr_studio)
|
||||
|
||||
def getCover(self, htmltree):
|
||||
return urljoin("https://www.javbus.com", super().getCover(htmltree))
|
||||
|
||||
def getRelease(self, htmltree):
|
||||
return super().getRelease(htmltree).strip(" ['']")
|
||||
|
||||
def getRuntime(self, htmltree):
|
||||
return super().getRuntime(htmltree).strip(" ['']分鐘")
|
||||
|
||||
def getActors(self, htmltree):
|
||||
actors = super().getActors(htmltree)
|
||||
b=[]
|
||||
for i in actors:
|
||||
b.append(i.attrib['title'])
|
||||
return b
|
||||
|
||||
def getActorPhoto(self, htmltree):
|
||||
actors = super().getActorPhoto(htmltree)
|
||||
d = {}
|
||||
for i in actors:
|
||||
p = i.attrib['src']
|
||||
if "nowprinting.gif" in p:
|
||||
continue
|
||||
t = i.attrib['title']
|
||||
d[t] = urljoin("https://www.javbus.com", p)
|
||||
return d
|
||||
|
||||
def getDirector(self, htmltree):
|
||||
if self.uncensored:
|
||||
return self.getTreeIndex(htmltree, self.expr_directorJa)
|
||||
else:
|
||||
return self.getTreeIndex(htmltree, self.expr_director)
|
||||
|
||||
def getSeries(self, htmltree):
|
||||
if self.uncensored:
|
||||
return self.getTreeIndex(htmltree, self.expr_series2)
|
||||
else:
|
||||
return self.getTreeIndex(htmltree, self.expr_series)
|
||||
|
||||
def getTags(self, htmltree):
|
||||
tags = super().getTags(htmltree).split(',')
|
||||
return tags[1:]
|
||||
|
||||
def getExtrafanart(self, htmltree):
|
||||
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
|
||||
html = html_pather.search(self.htmlcode)
|
||||
if html:
|
||||
html = html.group()
|
||||
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
|
||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||
if extrafanart_imgs:
|
||||
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
|
||||
return ''
|
||||
|
||||
def getOutline(self, htmltree):
|
||||
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
|
||||
return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度
|
||||
from .storyline import getStoryline
|
||||
return getStoryline(self.number , uncensored = self.uncensored)
|
||||
Reference in New Issue
Block a user