update lib
fix(airav): tags & extrafanart fix(mgstage): clean fix(fanza): outline
This commit is contained in:
@@ -3,7 +3,6 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from .parser import Parser
|
from .parser import Parser
|
||||||
from .javbus import Javbus
|
from .javbus import Javbus
|
||||||
|
|
||||||
@@ -17,12 +16,14 @@ class Airav(Parser):
|
|||||||
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
|
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
|
||||||
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
|
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
|
||||||
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
|
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
|
||||||
|
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
|
||||||
|
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
|
||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number
|
self.number = number
|
||||||
self.detailurl = 'https://cn.airav.wiki/video/' + number
|
self.detailurl = 'https://cn.airav.wiki/video/' + number
|
||||||
engine = Javbus()
|
engine = Javbus()
|
||||||
javbusinfo = engine.search(number, self)
|
javbusinfo = engine.scrape(number, self)
|
||||||
if javbusinfo == 404:
|
if javbusinfo == 404:
|
||||||
self.javbus = {"title": ""}
|
self.javbus = {"title": ""}
|
||||||
else:
|
else:
|
||||||
@@ -103,26 +104,8 @@ class Airav(Parser):
|
|||||||
return result
|
return result
|
||||||
return super().getCover(htmltree)
|
return super().getCover(htmltree)
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
|
||||||
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
|
|
||||||
html = html_pather.search(self.htmlcode)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
return extrafanart_imgs
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
tag = []
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
soup = BeautifulSoup(self.htmlcode, 'lxml')
|
|
||||||
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
|
|
||||||
a = x[0].find_all('a')
|
|
||||||
|
|
||||||
for i in a:
|
|
||||||
tag.append(i.get_text())
|
|
||||||
return tag
|
|
||||||
|
|
||||||
def getSeries(self, htmltree):
|
def getSeries(self, htmltree):
|
||||||
result = self.javbus.get('series')
|
result = self.javbus.get('series')
|
||||||
|
|||||||
@@ -11,10 +11,11 @@ class Fanza(Parser):
|
|||||||
|
|
||||||
expr_title = '//*[starts-with(@id, "title")]/text()'
|
expr_title = '//*[starts-with(@id, "title")]/text()'
|
||||||
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||||
expr_cover = './/head/meta[@property="og:image"]'
|
expr_cover = '//head/meta[@property="og:image"]'
|
||||||
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
||||||
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
||||||
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
||||||
|
expr_outline_og = '//head/meta[@property="og:description"]'
|
||||||
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
@@ -73,6 +74,8 @@ class Fanza(Parser):
|
|||||||
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
|
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
|
||||||
if result == '':
|
if result == '':
|
||||||
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
||||||
|
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
|
||||||
|
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
|
||||||
return result
|
return result
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import re
|
from .utils import getTreeElement
|
||||||
from lxml import etree
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from .parser import Parser
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
@@ -12,7 +10,7 @@ class Mgstage(Parser):
|
|||||||
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
|
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
|
||||||
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
|
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
|
||||||
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
|
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
|
||||||
expr_outline = '//p/text()'
|
expr_outline = '//dl[@id="introduction"]/dd/p/text()'
|
||||||
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
|
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
|
||||||
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||||
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
|
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
|
||||||
@@ -22,60 +20,33 @@ class Mgstage(Parser):
|
|||||||
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
|
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
|
||||||
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
|
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
|
||||||
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||||
|
expr_extrafanart = '//a[@class="sample_image"]/@href'
|
||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number.upper()
|
self.number = number.upper()
|
||||||
self.cookies = {'adc':'1'}
|
self.cookies = {'adc':'1'}
|
||||||
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
|
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
|
||||||
self.htmlcode = self.getHtml(self.detailurl)
|
htmltree =self.getHtmlTree(self.detailurl)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
soup = BeautifulSoup(self.htmlcode, 'lxml')
|
|
||||||
self.detailpage = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
|
||||||
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
|
||||||
self.htmlcodetree = etree.HTML(self.htmlcode)
|
|
||||||
self.detailtree = etree.HTML(self.detailpage)
|
|
||||||
self.introtree = etree.HTML(b2)
|
|
||||||
|
|
||||||
result = self.dictformat(self.detailtree)
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def getTitle(self, htmltree):
|
def getTitle(self, htmltree):
|
||||||
return super().getTitle(self.htmlcodetree).replace('/', ',').replace("\\n",'').replace(' ', '').strip()
|
return super().getTitle(htmltree).replace('/', ',').strip()
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
|
||||||
return super().getOutline(self.introtree).strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
|
||||||
return super().getCover(self.htmlcodetree)
|
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
|
results = self.getTreeAll(htmltree, self.expr_tags)
|
||||||
result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
|
results2 = self.getTreeAll(htmltree, self.expr_tags2)
|
||||||
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
return [ x.strip() for x in (results + results2) if x.strip()]
|
||||||
return result
|
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
def getTreeAll(self, tree, expr):
|
||||||
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
|
alls = super().getTreeAll(tree, expr)
|
||||||
html = html_pather.search(self.htmlcode)
|
return [ x.strip() for x in alls ]
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
return extrafanart_imgs
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getTreeElement(self, tree, expr, index=0):
|
def getTreeElement(self, tree, expr, index=0):
|
||||||
if expr == '':
|
if expr == '':
|
||||||
return ''
|
return ''
|
||||||
if tree == self.detailtree:
|
result1 = getTreeElement(tree, expr).strip().replace("', '", '').strip(" ['']")
|
||||||
# NOTE: 合并 getMgsString
|
result2 = getTreeElement(tree, expr.replace('td/a/','td/')).strip().replace("', '", '').strip(" ['']")
|
||||||
result1 = str(tree.xpath(expr)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
if result1 == result2:
|
||||||
result2 = str(tree.xpath(expr.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
|
return str(result1).strip('+').replace("', '",'').replace('"','')
|
||||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||||
else:
|
|
||||||
result = tree.xpath(expr)
|
|
||||||
try:
|
|
||||||
return result[index]
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ class Tmdb(Parser):
|
|||||||
imagecut = 0
|
imagecut = 0
|
||||||
apikey = None
|
apikey = None
|
||||||
|
|
||||||
expr_title = './/head/meta[@property="og:title"]'
|
expr_title = '//head/meta[@property="og:title"]'
|
||||||
expr_release = '//div/span[@class="release"]/text()'
|
expr_release = '//div/span[@class="release"]/text()'
|
||||||
expr_cover = './/head/meta[@property="og:image"]'
|
expr_cover = '//head/meta[@property="og:image"]'
|
||||||
expr_outline = './/head/meta[@property="og:description"]'
|
expr_outline = '//head/meta[@property="og:description"]'
|
||||||
|
|
||||||
# def search(self, number):
|
# def search(self, number):
|
||||||
# self.detailurl = self.queryNumberUrl(number)
|
# self.detailurl = self.queryNumberUrl(number)
|
||||||
|
|||||||
Reference in New Issue
Block a user