minor fixes
- Fix DLsite: strip the discount prefix (e.g. "【…OFF】") from scraped titles
- Fix Fanza: read the cover from the og:image meta tag and collect extrafanart via an XPath expression
This commit is contained in:
@@ -36,25 +36,6 @@ def search(number, sources: str=None, proxies=None, verify=None, type='adult',
|
|||||||
|
|
||||||
class Scraping():
|
class Scraping():
|
||||||
"""
|
"""
|
||||||
|
|
||||||
只爬取内容,不经修改
|
|
||||||
|
|
||||||
如果需要翻译等,再针对此方法封装一层
|
|
||||||
也不做 naming rule 处理
|
|
||||||
|
|
||||||
可以指定刮削库,可查询当前支持的刮削库
|
|
||||||
|
|
||||||
参数:
|
|
||||||
number
|
|
||||||
cookies
|
|
||||||
proxy
|
|
||||||
sources
|
|
||||||
TODO multi threading (加速是否会触发反爬?)
|
|
||||||
|
|
||||||
[x] translate
|
|
||||||
[x] naming rule
|
|
||||||
[x] convert: actress name/tags
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
|
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
|
||||||
|
|||||||
@@ -66,6 +66,8 @@ class Dlsite(Parser):
|
|||||||
result = super().getTitle(htmltree)
|
result = super().getTitle(htmltree)
|
||||||
result = result[:result.rfind(' | DLsite')]
|
result = result[:result.rfind(' | DLsite')]
|
||||||
result = result[:result.rfind(' [')]
|
result = result[:result.rfind(' [')]
|
||||||
|
if 'OFF】' in result:
|
||||||
|
result = result[result.find('】')+1:]
|
||||||
result = result.replace('【HD版】', '')
|
result = result.replace('【HD版】', '')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,8 @@ class Fanza(Parser):
|
|||||||
|
|
||||||
expr_title = '//*[starts-with(@id, "title")]/text()'
|
expr_title = '//*[starts-with(@id, "title")]/text()'
|
||||||
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||||
|
expr_cover = './/head/meta[@property="og:image"]'
|
||||||
|
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
||||||
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
||||||
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
||||||
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
||||||
@@ -85,9 +87,9 @@ class Fanza(Parser):
|
|||||||
|
|
||||||
def getActors(self, htmltree):
|
def getActors(self, htmltree):
|
||||||
if "anime" not in self.detailurl:
|
if "anime" not in self.detailurl:
|
||||||
return super().getActors(htmltree).replace("', '", ",")
|
return super().getActors(htmltree)
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getRelease(self, htmltree):
|
def getRelease(self, htmltree):
|
||||||
result = self.getFanzaString('発売日:')
|
result = self.getFanzaString('発売日:')
|
||||||
if result == '' or result == '----':
|
if result == '' or result == '----':
|
||||||
@@ -95,43 +97,11 @@ class Fanza(Parser):
|
|||||||
return result.replace("/", "-").strip('\\n')
|
return result.replace("/", "-").strip('\\n')
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
def getCover(self, htmltree):
|
||||||
# return super().getCover(htmltree)
|
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
|
||||||
cover_number = self.fanza_hinban
|
|
||||||
try:
|
|
||||||
result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
|
||||||
except:
|
|
||||||
# sometimes fanza modify _ to \u0005f for image id
|
|
||||||
if "_" in cover_number:
|
|
||||||
cover_number = cover_number.replace("_", r"\u005f")
|
|
||||||
try:
|
|
||||||
result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
|
||||||
except:
|
|
||||||
# (TODO) handle more edge case
|
|
||||||
# print(html)
|
|
||||||
# raise exception here, same behavior as before
|
|
||||||
# people's major requirement is fetching the picture
|
|
||||||
raise ValueError("can not find image")
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getFanzaStrings('ジャンル:')
|
return self.getFanzaStrings('ジャンル:')
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
|
||||||
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
|
|
||||||
html = html_pather.search(self.htmlcode)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
s = []
|
|
||||||
for img_url in extrafanart_imgs:
|
|
||||||
img_urls = img_url.rsplit('-', 1)
|
|
||||||
img_url = img_urls[0] + 'jp-' + img_urls[1]
|
|
||||||
s.append(img_url)
|
|
||||||
return s
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getLabel(self, htmltree):
|
def getLabel(self, htmltree):
|
||||||
ret = self.getFanzaStrings('レーベル')
|
ret = self.getFanzaStrings('レーベル')
|
||||||
if ret == "----":
|
if ret == "----":
|
||||||
|
|||||||
Reference in New Issue
Block a user