From cd01de134440be056c16a53c13eb6f15839b9cf8 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Mon, 13 Jun 2022 15:15:32 +0800 Subject: [PATCH] minor fixes - fix dlsite discount - fix fanza: cover from og:image and extrafanart xpath expr --- scrapinglib/api.py | 19 ------------------- scrapinglib/dlsite.py | 2 ++ scrapinglib/fanza.py | 40 +++++----------------------------------- 3 files changed, 7 insertions(+), 54 deletions(-) diff --git a/scrapinglib/api.py b/scrapinglib/api.py index ba85acf..c8c4679 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -36,25 +36,6 @@ def search(number, sources: str=None, proxies=None, verify=None, type='adult', class Scraping(): """ - - 只爬取内容,不经修改 - - 如果需要翻译等,再针对此方法封装一层 - 也不做 naming rule 处理 - - 可以指定刮削库,可查询当前支持的刮削库 - - 参数: - number - cookies - proxy - sources - TODO multi threading (加速是否会触发反爬?) - - [x] translate - [x] naming rule - [x] convert: actress name/tags - """ adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index 9443751..25f1203 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -66,6 +66,8 @@ class Dlsite(Parser): result = super().getTitle(htmltree) result = result[:result.rfind(' | DLsite')] result = result[:result.rfind(' [')] + if 'OFF】' in result: + result = result[result.find('】')+1:] result = result.replace('【HD版】', '') return result diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 251d0a4..0ec77d0 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -11,6 +11,8 @@ class Fanza(Parser): expr_title = '//*[starts-with(@id, "title")]/text()' expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" + expr_cover = './/head/meta[@property="og:image"]' + expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" @@ -85,9 +87,9 @@ class Fanza(Parser): def getActors(self, htmltree): if "anime" not in self.detailurl: - return super().getActors(htmltree).replace("', '", ",") + return super().getActors(htmltree) return '' - + def getRelease(self, htmltree): result = self.getFanzaString('発売日:') if result == '' or result == '----': @@ -95,43 +97,11 @@ class Fanza(Parser): return result.replace("/", "-").strip('\\n') def getCover(self, htmltree): - # return super().getCover(htmltree) - cover_number = self.fanza_hinban - try: - result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href') - except: - # sometimes fanza modify _ to \u0005f for image id - if "_" in cover_number: - cover_number = cover_number.replace("_", r"\u005f") - try: - result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href') - except: - # (TODO) handle more edge case - # print(html) - # raise exception here, same behavior as before - # people's major requirement is fetching the picture - raise ValueError("can not find image") - return result + return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content') def getTags(self, htmltree): return self.getFanzaStrings('ジャンル:') - def getExtrafanart(self, htmltree): - html_pather = re.compile(r'
\n') - html = html_pather.search(self.htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'