From 0dda035057b2c0ef4c9a02efab120d4130706d95 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Wed, 15 Jun 2022 14:23:49 +0800 Subject: [PATCH] update scrapinglib MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 优化提取extrafanart,trailer等,直接使用xpath expr,不需要正则匹配 - 优化 getchu 获取cover方法,直接使用og标签信息 - 优化 www.getchu 识别 getchu-id 的资源 - 统一获取 tag 方法,返回值 list --- scrapinglib/airav.py | 3 -- scrapinglib/avsox.py | 4 +- scrapinglib/carib.py | 4 -- scrapinglib/dlsite.py | 8 +-- scrapinglib/fanza.py | 4 +- scrapinglib/fc2.py | 12 +---- scrapinglib/gcolle.py | 3 -- scrapinglib/getchu.py | 19 +++---- scrapinglib/jav321.py | 43 +++------------- scrapinglib/javbus.py | 19 ++----- scrapinglib/javdb.py | 60 +++++++--------------- scrapinglib/mgstage.py | 16 +++--- scrapinglib/mv91.py | 3 -- scrapinglib/parser.py | 110 ++++++++++++++++++++--------------------- scrapinglib/utils.py | 8 +-- scrapinglib/xcity.py | 9 +--- 16 files changed, 107 insertions(+), 218 deletions(-) diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index 1d36805..c22384d 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -104,9 +104,6 @@ class Airav(Parser): return result return super().getCover(htmltree) - def getTags(self, htmltree): - return self.getTreeAll(htmltree, self.expr_tags) - def getSeries(self, htmltree): result = self.javbus.get('series') if isinstance(result, str) and len(result): diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index 3fde11e..c41cb6e 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -53,7 +53,7 @@ class Avsox(Parser): return self.getTreeElement(self.searchtree, self.expr_smallcover) def getTags(self, htmltree): - tags = super().getTags(htmltree).split(',') + tags = self.getTreeElement(htmltree).split(',') return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] def getOutline(self, htmltree): @@ -70,7 +70,7 @@ class Avsox(Parser): return d def getActorPhoto(self, htmltree): - a = super().getActorPhoto(htmltree) + a = self.getTreeAll(htmltree, self.expr_actorphoto) d = {} for i in a: l = i.find('.//img').attrib['src'] diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 9fac553..af99d57 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -47,9 +47,6 @@ class Carib(Parser): def getCover(self, htmltree): return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg' - def getTags(self, htmltree): - return self.getTreeAll(htmltree, self.expr_tags) - def getExtrafanart(self, htmltree): r = [] genres = self.getTreeAll(htmltree, self.expr_extrafanart) @@ -62,7 +59,6 @@ class Carib(Parser): return r def getActorPhoto(self, htmltree): - # return super().getActorPhoto(htmltree) htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()") t = {} diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index 25f1203..cc701c9 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -74,9 +74,8 @@ class Dlsite(Parser): def getOutline(self, htmltree): total = [] result = self.getTreeAll(htmltree, self.expr_outline) - for i in result: - total.append(i.strip('\r\n')) - return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") + total = [ x.strip() for x in result if x.strip()] + return '\n'.join(total) def getRelease(self, htmltree): return super().getRelease(htmltree).replace('年','-').replace('月','-').replace('日','') @@ -84,9 +83,6 @@ class Dlsite(Parser): def getCover(self, htmltree): return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg') - def getTags(self, htmltree): - return self.getTreeAll(htmltree, self.expr_tags) - def getExtrafanart(self, htmltree): try: result = [] diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index 1f60d3e..f6cc01c 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -106,13 +106,13 @@ class Fanza(Parser): return self.getFanzaStrings('ジャンル:') def getLabel(self, htmltree): - ret = self.getFanzaStrings('レーベル') + ret = self.getFanzaString('レーベル') if ret == "----": return '' return ret def getSeries(self, htmltree): - ret = self.getFanzaStrings('シリーズ:') + ret = self.getFanzaString('シリーズ:') if ret == "----": return '' return ret diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index c12a1ce..13640ed 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -18,6 +18,7 @@ class Fc2(Parser): expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src" + expr_extrafanart = '//ul[@class="items_article_SampleImagesArea"]/li/a/@href' expr_tags = "//a[@class='tag tagTag']/text()" def search(self, number): @@ -45,17 +46,6 @@ class Fc2(Parser): def getCover(self, htmltree): return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree)) - def getExtrafanart(self, htmltree): - html_pather = re.compile(r'