update lib
fix(airav): tags & extrafanart
fix(mgstage): clean
fix(fanza): outline
This commit is contained in:
@@ -3,7 +3,6 @@
|
||||
import json
|
||||
import re
|
||||
from lxml import etree
|
||||
from bs4 import BeautifulSoup
|
||||
from .parser import Parser
|
||||
from .javbus import Javbus
|
||||
|
||||
@@ -17,12 +16,14 @@ class Airav(Parser):
|
||||
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
|
||||
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
|
||||
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
|
||||
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
|
||||
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
|
||||
|
||||
def search(self, number):
|
||||
self.number = number
|
||||
self.detailurl = 'https://cn.airav.wiki/video/' + number
|
||||
engine = Javbus()
|
||||
javbusinfo = engine.search(number, self)
|
||||
javbusinfo = engine.scrape(number, self)
|
||||
if javbusinfo == 404:
|
||||
self.javbus = {"title": ""}
|
||||
else:
|
||||
@@ -103,26 +104,8 @@ class Airav(Parser):
|
||||
return result
|
||||
return super().getCover(htmltree)
|
||||
|
||||
def getExtrafanart(self, htmltree):
|
||||
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
|
||||
html = html_pather.search(self.htmlcode)
|
||||
if html:
|
||||
html = html.group()
|
||||
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
|
||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||
if extrafanart_imgs:
|
||||
return extrafanart_imgs
|
||||
return ''
|
||||
|
||||
def getTags(self, htmltree):
|
||||
tag = []
|
||||
soup = BeautifulSoup(self.htmlcode, 'lxml')
|
||||
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
|
||||
a = x[0].find_all('a')
|
||||
|
||||
for i in a:
|
||||
tag.append(i.get_text())
|
||||
return tag
|
||||
return self.getTreeAll(htmltree, self.expr_tags)
|
||||
|
||||
def getSeries(self, htmltree):
|
||||
result = self.javbus.get('series')
|
||||
|
||||
@@ -11,10 +11,11 @@ class Fanza(Parser):
|
||||
|
||||
expr_title = '//*[starts-with(@id, "title")]/text()'
|
||||
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||
expr_cover = './/head/meta[@property="og:image"]'
|
||||
expr_cover = '//head/meta[@property="og:image"]'
|
||||
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
||||
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
||||
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
||||
expr_outline_og = '//head/meta[@property="og:description"]'
|
||||
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
||||
|
||||
def search(self, number):
|
||||
@@ -73,6 +74,8 @@ class Fanza(Parser):
|
||||
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
|
||||
if result == '':
|
||||
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
||||
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
|
||||
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
|
||||
return result
|
||||
except:
|
||||
return ''
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
from lxml import etree
|
||||
from bs4 import BeautifulSoup
|
||||
from .utils import getTreeElement
|
||||
from .parser import Parser
|
||||
|
||||
|
||||
@@ -12,7 +10,7 @@ class Mgstage(Parser):
|
||||
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
|
||||
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
|
||||
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
|
||||
expr_outline = '//p/text()'
|
||||
expr_outline = '//dl[@id="introduction"]/dd/p/text()'
|
||||
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
|
||||
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
|
||||
@@ -22,60 +20,33 @@ class Mgstage(Parser):
|
||||
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
|
||||
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
|
||||
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||
expr_extrafanart = '//a[@class="sample_image"]/@href'
|
||||
|
||||
def search(self, number):
|
||||
self.number = number.upper()
|
||||
self.cookies = {'adc':'1'}
|
||||
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
|
||||
self.htmlcode = self.getHtml(self.detailurl)
|
||||
|
||||
soup = BeautifulSoup(self.htmlcode, 'lxml')
|
||||
self.detailpage = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
||||
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
||||
self.htmlcodetree = etree.HTML(self.htmlcode)
|
||||
self.detailtree = etree.HTML(self.detailpage)
|
||||
self.introtree = etree.HTML(b2)
|
||||
|
||||
result = self.dictformat(self.detailtree)
|
||||
htmltree =self.getHtmlTree(self.detailurl)
|
||||
result = self.dictformat(htmltree)
|
||||
return result
|
||||
|
||||
def getTitle(self, htmltree):
|
||||
return super().getTitle(self.htmlcodetree).replace('/', ',').replace("\\n",'').replace(' ', '').strip()
|
||||
|
||||
def getOutline(self, htmltree):
|
||||
return super().getOutline(self.introtree).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
||||
|
||||
def getCover(self, htmltree):
|
||||
return super().getCover(self.htmlcodetree)
|
||||
return super().getTitle(htmltree).replace('/', ',').strip()
|
||||
|
||||
def getTags(self, htmltree):
|
||||
result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
||||
return result
|
||||
results = self.getTreeAll(htmltree, self.expr_tags)
|
||||
results2 = self.getTreeAll(htmltree, self.expr_tags2)
|
||||
return [ x.strip() for x in (results + results2) if x.strip()]
|
||||
|
||||
def getExtrafanart(self, htmltree):
|
||||
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
|
||||
html = html_pather.search(self.htmlcode)
|
||||
if html:
|
||||
html = html.group()
|
||||
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
|
||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||
if extrafanart_imgs:
|
||||
return extrafanart_imgs
|
||||
return ''
|
||||
def getTreeAll(self, tree, expr):
|
||||
alls = super().getTreeAll(tree, expr)
|
||||
return [ x.strip() for x in alls ]
|
||||
|
||||
def getTreeElement(self, tree, expr, index=0):
|
||||
if expr == '':
|
||||
return ''
|
||||
if tree == self.detailtree:
|
||||
# NOTE: merges in the former getMgsString logic (original note: 合并 getMgsString)
|
||||
result1 = str(tree.xpath(expr)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
||||
result2 = str(tree.xpath(expr.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
|
||||
result1 = getTreeElement(tree, expr).strip().replace("', '", '').strip(" ['']")
|
||||
result2 = getTreeElement(tree, expr.replace('td/a/','td/')).strip().replace("', '", '').strip(" ['']")
|
||||
if result1 == result2:
|
||||
return str(result1).strip('+').replace("', '",'').replace('"','')
|
||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||
else:
|
||||
result = tree.xpath(expr)
|
||||
try:
|
||||
return result[index]
|
||||
except:
|
||||
return ''
|
||||
|
||||
@@ -13,10 +13,10 @@ class Tmdb(Parser):
|
||||
imagecut = 0
|
||||
apikey = None
|
||||
|
||||
expr_title = './/head/meta[@property="og:title"]'
|
||||
expr_title = '//head/meta[@property="og:title"]'
|
||||
expr_release = '//div/span[@class="release"]/text()'
|
||||
expr_cover = './/head/meta[@property="og:image"]'
|
||||
expr_outline = './/head/meta[@property="og:description"]'
|
||||
expr_cover = '//head/meta[@property="og:image"]'
|
||||
expr_outline = '//head/meta[@property="og:description"]'
|
||||
|
||||
# def search(self, number):
|
||||
# self.detailurl = self.queryNumberUrl(number)
|
||||
|
||||
Reference in New Issue
Block a user