update scrapinglib

- support specifiedUrl when scraping a single movie
- support javlibrary and rating
This commit is contained in:
Mathhew
2022-07-28 18:45:54 +08:00
parent ee1306fb3b
commit ce388edce8
23 changed files with 379 additions and 176 deletions

View File

@@ -8,6 +8,9 @@ from .javbus import Javbus
class Airav(Parser):
source = 'airav'
# for javbus
specifiedSource = None
addtion_Javbus = True
expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()'
@@ -21,23 +24,38 @@ class Airav(Parser):
def search(self, number):
    """Scrape the airav detail page for ``number`` and return the formatted result.

    The detail URL is ``self.specifiedUrl`` when that is set; otherwise it
    is resolved through ``queryNumberUrl`` (which may also replace
    ``self.number`` with the number as listed by the site).  When
    ``self.addtion_Javbus`` is true, Javbus is scraped as well and its
    decoded JSON is kept in ``self.javbus`` for the getters to consult.

    Returns whatever ``self.dictformat`` produces for the parsed page.
    """
    self.number = number
    if self.specifiedUrl:
        self.detailurl = self.specifiedUrl
    else:
        self.detailurl = self.queryNumberUrl(self.number)
    # Javbus is queried only when enabled, and exactly once per search.
    if self.addtion_Javbus:
        engine = Javbus()
        javbusinfo = engine.scrape(self.number, self)
        if javbusinfo == 404:
            # Placeholder so later ``self.javbus.get(...)`` calls still work.
            self.javbus = {"title": ""}
        else:
            self.javbus = json.loads(javbusinfo)
    self.htmlcode = self.getHtml(self.detailurl)
    htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
    return self.dictformat(htmltree)
def queryNumberUrl(self, number):
    """Resolve the detail-page URL for ``number`` via the site's search page.

    Each search result's listed number is compared with ``number``,
    ignoring hyphens and letter case.  On the first match ``self.number``
    is set to the number as listed and the result's link is returned as an
    absolute URL.  If nothing matches, the conventional
    ``/video/<number>`` URL is returned.
    """
    queryUrl = "https://cn.airav.wiki/?search=" + number
    queryTree = self.getHtmlTree(queryUrl)
    results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a')
    wanted = number.replace('-', '').upper()
    for item in results:
        # The leading "." keeps the XPath relative to this result; a bare
        # "//" is absolute and would select from the whole document.
        # NOTE(review): assumes getTreeElement evaluates the expression on
        # the node it is given -- confirm against the Parser base class.
        num = self.getTreeElement(item, './/div/div[contains(@class,"videoNumber")]/p[1]/text()')
        if isinstance(num, str) and num.replace('-', '').upper() == wanted:
            self.number = num
            return "https://cn.airav.wiki" + item.attrib['href']
    return 'https://cn.airav.wiki/video/' + number
def getNum(self, htmltree):
    """Return the movie number.

    When ``self.addtion_Javbus`` is true and Javbus supplied a non-empty
    string under ``'number'``, that value is returned.  Otherwise the
    number is the text between the leading ``[`` and the first ``]`` of
    the parent class's ``getNum`` result.

    Raises IndexError when the parent value has no such bracketed prefix
    (same as before).
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('number')
        if isinstance(result, str) and result:
            return result
    number = super().getNum(htmltree)
    # Raw string: "\[" is not a valid escape in a normal string literal.
    return str(re.findall(r'^\[(.*?)]', number)[0])
@@ -48,24 +66,27 @@ class Airav(Parser):
return result
def getStudio(self, htmltree):
    """Return the studio, preferring the Javbus value when enabled.

    Falls back to the parent class's ``getStudio`` when Javbus is disabled
    or did not supply a non-empty string under ``'studio'``.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('studio')
        if isinstance(result, str) and result:
            return result
    return super().getStudio(htmltree)
def getRelease(self, htmltree):
    """Return the release date as ``YYYY-MM-DD``, or '' when unavailable.

    The Javbus value is used when enabled and non-empty.  Otherwise the
    first ``YYYY-MM-DD`` substring of the parent class's ``getRelease``
    result is returned; any failure on that path yields ''.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('release')
        if isinstance(result, str) and result:
            return result
    try:
        return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
    except Exception:
        # Best effort: no date on the page (``search`` returned None) or a
        # parent-side failure.  Unlike a bare ``except``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return ''
def getYear(self, htmltree):
    """Return the four-digit release year, or '' when it cannot be found.

    The Javbus value is used when enabled and non-empty.  Otherwise the
    year is the first run of four digits in ``self.getRelease(htmltree)``.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('year')
        if isinstance(result, str) and result:
            return result
    release = self.getRelease(htmltree)
    # Take the first match directly; stripping the repr of a findall()
    # list leaves quote/comma junk when there is more than one match.
    matched = re.search(r'\d{4}', release)
    return matched.group() if matched else ''
@@ -73,39 +94,40 @@ class Airav(Parser):
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree):
    """Return the runtime reported by Javbus, or '' when unavailable.

    ``htmltree`` is not used; the value comes only from ``self.javbus``,
    and only when ``self.addtion_Javbus`` is true.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('runtime')
        if isinstance(result, str) and result:
            return result
    return ''
def getDirector(self, htmltree):
    """Return the director reported by Javbus, or '' when unavailable.

    ``htmltree`` is not used; the value comes only from ``self.javbus``,
    and only when ``self.addtion_Javbus`` is true.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('director')
        if isinstance(result, str) and result:
            return result
    return ''
def getActors(self, htmltree):
    """Return the list of actor names.

    Names from the parent class's ``getActors`` are stripped and blank
    entries dropped; if any remain they are returned.  Otherwise the
    Javbus ``'actor'`` list is returned when Javbus is enabled and the
    list is non-empty.  Returns ``[]`` when neither source has names.
    """
    raw_names = super().getActors(htmltree)
    # Strip first, then filter, so whitespace-only entries are dropped
    # rather than kept as empty strings.
    names = [name for name in (raw.strip() for raw in raw_names) if name]
    if names:
        return names
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('actor')
        if isinstance(result, list) and result:
            return result
    return []
def getCover(self, htmltree):
    """Return the cover URL, preferring the Javbus value when enabled.

    Falls back to the parent class's ``getCover`` when Javbus is disabled
    or did not supply a non-empty string under ``'cover'``.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('cover')
        if isinstance(result, str) and result:
            return result
    return super().getCover(htmltree)
def getSeries(self, htmltree):
    """Return the series reported by Javbus, or '' when unavailable.

    ``htmltree`` is not used; the value comes only from ``self.javbus``,
    and only when ``self.addtion_Javbus`` is true.
    """
    # self.javbus only exists when the Javbus lookup is enabled.
    if self.addtion_Javbus:
        result = self.javbus.get('series')
        if isinstance(result, str) and result:
            return result
    return ''