update lib

fix(airav): tags & extrafanart
fix(mgstage): clean
fix(fanza): outline
This commit is contained in:
Mathhew
2022-06-13 18:24:10 +08:00
parent cd01de1344
commit efb805a987
4 changed files with 28 additions and 71 deletions

View File

@@ -3,7 +3,6 @@
import json import json
import re import re
from lxml import etree from lxml import etree
from bs4 import BeautifulSoup
from .parser import Parser from .parser import Parser
from .javbus import Javbus from .javbus import Javbus
@@ -17,12 +16,14 @@ class Airav(Parser):
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)" expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()' expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src' expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
def search(self, number): def search(self, number):
self.number = number self.number = number
self.detailurl = 'https://cn.airav.wiki/video/' + number self.detailurl = 'https://cn.airav.wiki/video/' + number
engine = Javbus() engine = Javbus()
javbusinfo = engine.search(number, self) javbusinfo = engine.scrape(number, self)
if javbusinfo == 404: if javbusinfo == 404:
self.javbus = {"title": ""} self.javbus = {"title": ""}
else: else:
@@ -103,26 +104,8 @@ class Airav(Parser):
return result return result
return super().getCover(htmltree) return super().getCover(htmltree)
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTags(self, htmltree): def getTags(self, htmltree):
tag = [] return self.getTreeAll(htmltree, self.expr_tags)
soup = BeautifulSoup(self.htmlcode, 'lxml')
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
a = x[0].find_all('a')
for i in a:
tag.append(i.get_text())
return tag
def getSeries(self, htmltree): def getSeries(self, htmltree):
result = self.javbus.get('series') result = self.javbus.get('series')

View File

@@ -11,10 +11,11 @@ class Fanza(Parser):
expr_title = '//*[starts-with(@id, "title")]/text()' expr_title = '//*[starts-with(@id, "title")]/text()'
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
expr_cover = './/head/meta[@property="og:image"]' expr_cover = '//head/meta[@property="og:image"]'
expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_extrafanart = '//a[@name="sample-image"]/img/@src'
expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline = "//div[@class='mg-b20 lh4']/text()"
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
expr_outline_og = '//head/meta[@property="og:description"]'
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
def search(self, number): def search(self, number):
@@ -73,6 +74,8 @@ class Fanza(Parser):
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "") result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
if result == '': if result == '':
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
return result return result
except: except:
return '' return ''

View File

@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re from .utils import getTreeElement
from lxml import etree
from bs4 import BeautifulSoup
from .parser import Parser from .parser import Parser
@@ -12,7 +10,7 @@ class Mgstage(Parser):
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()' expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
expr_title = '//*[@id="center_column"]/div[1]/h1/text()' expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()' expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
expr_outline = '//p/text()' expr_outline = '//dl[@id="introduction"]/dd/p/text()'
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()' expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()' expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()' expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
@@ -22,60 +20,33 @@ class Mgstage(Parser):
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()' expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()' expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()' expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
expr_extrafanart = '//a[@class="sample_image"]/@href'
def search(self, number): def search(self, number):
self.number = number.upper() self.number = number.upper()
self.cookies = {'adc':'1'} self.cookies = {'adc':'1'}
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
self.htmlcode = self.getHtml(self.detailurl) htmltree =self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
soup = BeautifulSoup(self.htmlcode, 'lxml')
self.detailpage = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
self.htmlcodetree = etree.HTML(self.htmlcode)
self.detailtree = etree.HTML(self.detailpage)
self.introtree = etree.HTML(b2)
result = self.dictformat(self.detailtree)
return result return result
def getTitle(self, htmltree): def getTitle(self, htmltree):
return super().getTitle(self.htmlcodetree).replace('/', ',').replace("\\n",'').replace(' ', '').strip() return super().getTitle(htmltree).replace('/', ',').strip()
def getOutline(self, htmltree):
return super().getOutline(self.introtree).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
def getCover(self, htmltree):
return super().getCover(self.htmlcodetree)
def getTags(self, htmltree): def getTags(self, htmltree):
result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n') results = self.getTreeAll(htmltree, self.expr_tags)
result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n') results2 = self.getTreeAll(htmltree, self.expr_tags2)
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') return [ x.strip() for x in (results + results2) if x.strip()]
return result
def getExtrafanart(self, htmltree): def getTreeAll(self, tree, expr):
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>') alls = super().getTreeAll(tree, expr)
html = html_pather.search(self.htmlcode) return [ x.strip() for x in alls ]
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTreeElement(self, tree, expr, index=0): def getTreeElement(self, tree, expr, index=0):
if expr == '': if expr == '':
return '' return ''
if tree == self.detailtree: result1 = getTreeElement(tree, expr).strip().replace("', '", '').strip(" ['']")
# NOTE: 合并 getMgsString result2 = getTreeElement(tree, expr.replace('td/a/','td/')).strip().replace("', '", '').strip(" ['']")
result1 = str(tree.xpath(expr)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '') if result1 == result2:
result2 = str(tree.xpath(expr.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n') return str(result1).strip('+').replace("', '",'').replace('"','')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','') return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
else:
result = tree.xpath(expr)
try:
return result[index]
except:
return ''

View File

@@ -13,10 +13,10 @@ class Tmdb(Parser):
imagecut = 0 imagecut = 0
apikey = None apikey = None
expr_title = './/head/meta[@property="og:title"]' expr_title = '//head/meta[@property="og:title"]'
expr_release = '//div/span[@class="release"]/text()' expr_release = '//div/span[@class="release"]/text()'
expr_cover = './/head/meta[@property="og:image"]' expr_cover = '//head/meta[@property="og:image"]'
expr_outline = './/head/meta[@property="og:description"]' expr_outline = '//head/meta[@property="og:description"]'
# def search(self, number): # def search(self, number):
# self.detailurl = self.queryNumberUrl(number) # self.detailurl = self.queryNumberUrl(number)