From 4074dcd366e65469c072609dda5c590247f7073f Mon Sep 17 00:00:00 2001 From: Mathhew Date: Mon, 13 Jun 2022 10:00:41 +0800 Subject: [PATCH] update scrapinglib --- scrapinglib/airav.py | 2 +- scrapinglib/avsox.py | 12 +++++----- scrapinglib/carib.py | 4 ++-- scrapinglib/dlsite.py | 14 ++++++------ scrapinglib/fanza.py | 8 +++---- scrapinglib/gcolle.py | 10 ++++---- scrapinglib/getchu.py | 8 +++---- scrapinglib/jav321.py | 6 ++--- scrapinglib/javbus.py | 12 +++++----- scrapinglib/javdb.py | 48 +++++++++++++++++++------------------- scrapinglib/madou.py | 2 +- scrapinglib/mgstage.py | 6 ++--- scrapinglib/mv91.py | 4 ++-- scrapinglib/parser.py | 52 +++++++++++++++++++++--------------------- scrapinglib/tmdb.py | 6 ++--- scrapinglib/xcity.py | 6 ++--- 16 files changed, 100 insertions(+), 100 deletions(-) diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index 69f36b0..e1a3ea6 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -69,7 +69,7 @@ class Airav(Parser): return str(re.findall('\d{4}', release)).strip(" ['']") def getOutline(self, htmltree): - return self.getAll(htmltree, self.expr_outline).replace('\n','').strip() + return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip() def getRuntime(self, htmltree): result = self.javbus.get('runtime') diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index ffb6723..3fde11e 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -23,19 +23,19 @@ class Avsox(Parser): def queryNumberUrl(self, number): qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox') - site = self.getTreeIndex(qurySiteTree, '//div[@class="container"]/div/a/@href') + site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href') self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) - result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') if result1 == '' or result1 == 'null' or result1 == 'None': self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_')) - result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') if result1 == '' or result1 == 'null' or result1 == 'None': self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', '')) - result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href') return "https:" + result1 def getNum(self, htmltree): - new_number = self.getTreeIndex(htmltree, self.expr_number) + new_number = self.getTreeElement(htmltree, self.expr_number) if new_number.upper() != self.number.upper(): raise ValueError('number not found in ' + self.source) self.number = new_number @@ -50,7 +50,7 @@ class Avsox(Parser): def getSmallCover(self, htmltree): """ 使用搜索页面的预览小图 """ - return self.getTreeIndex(self.searchtree, self.expr_smallcover) + return self.getTreeElement(self.searchtree, self.expr_smallcover) def getTags(self, htmltree): tags = super().getTags(htmltree).split(',') diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 415006d..9fac553 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -48,11 +48,11 @@ class Carib(Parser): return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg' def getTags(self, htmltree): - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) def getExtrafanart(self, htmltree): r = [] - genres = self.getAll(htmltree, self.expr_extrafanart) + genres = self.getTreeAll(htmltree, self.expr_extrafanart) for g in genres: jpg = str(g) if '/member/' in jpg: diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index 125afe2..9443751 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -36,22 +36,22 @@ class Dlsite(Parser): else: self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie' htmltree = self.getHtmlTree(self.detailurl) - search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') if len(search_result) == 0: number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","") htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') - search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') if len(search_result) == 0: if "~" in number: number = number.replace("~","〜") elif "〜" in number: number = number.replace("〜","~") htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') - search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') if len(search_result) == 0: number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '') htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') - search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') self.detailurl = search_result[0] htmltree = self.getHtmlTree(self.detailurl) self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") @@ -71,7 +71,7 @@ class Dlsite(Parser): def getOutline(self, htmltree): total = [] - result = self.getAll(htmltree, self.expr_outline) + result = self.getTreeAll(htmltree, self.expr_outline) for i in result: total.append(i.strip('\r\n')) return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") @@ -83,12 +83,12 @@ class Dlsite(Parser): return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg') def getTags(self, htmltree): - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) def getExtrafanart(self, htmltree): try: result = [] - for i in self.getAll(self.expr_extrafanart): + for i in self.getTreeAll(self.expr_extrafanart): result.append("https:" + i) except: result = '' diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index b3e5824..251d0a4 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -68,9 +68,9 @@ class Fanza(Parser): def getOutline(self, htmltree): try: - result = self.getTreeIndex(htmltree, self.expr_outline).replace("\n", "") + result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "") if result == '': - result = self.getTreeIndex(htmltree, self.expr_outline2).replace("\n", "") + result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") return result except: return '' @@ -98,13 +98,13 @@ class Fanza(Parser): # return super().getCover(htmltree) cover_number = self.fanza_hinban try: - result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') + result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href') except: # sometimes fanza modify _ to \u0005f for image id if "_" in cover_number: cover_number = cover_number.replace("_", r"\u005f") try: - result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') + result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href') except: # (TODO) handle more edge case # print(html) diff --git a/scrapinglib/gcolle.py b/scrapinglib/gcolle.py index 93fc58b..14a0e17 100644 --- a/scrapinglib/gcolle.py +++ b/scrapinglib/gcolle.py @@ -32,7 +32,7 @@ class Gcolle(Parser): htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text htmltree = etree.HTML(htmlcode) - r18url = self.getTreeIndex(htmltree, self.expr_r18) + r18url = self.getTreeElement(htmltree, self.expr_r18) if r18url and r18url.startswith('http'): htmlcode = session.get(r18url).text htmltree = etree.HTML(htmlcode) @@ -46,7 +46,7 @@ class Gcolle(Parser): return "GCOLLE-" + str(num) def getOutline(self, htmltree): - result = self.getAll(htmltree, self.expr_outline) + result = self.getTreeAll(htmltree, self.expr_outline) try: return "\n".join(result) except: @@ -59,12 +59,12 @@ class Gcolle(Parser): return "https:" + super().getCover(htmltree) def getTags(self, htmltree): - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) def getExtrafanart(self, htmltree): - extrafanart = self.getAll(htmltree, self.expr_extrafanart) + extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart) if len(extrafanart) == 0: - extrafanart = self.getAll(htmltree, self.expr_extrafanart2) + extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart2) # Add "https:" in each extrafanart url for i in range(len(extrafanart)): extrafanart[i] = 'https:' + extrafanart[i] diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py index 5261ee6..b29f4d8 100644 --- a/scrapinglib/getchu.py +++ b/scrapinglib/getchu.py @@ -53,7 +53,7 @@ class wwwGetchu(Parser): retry = 2 for i in range(retry): queryTree = self.getHtmlTree(queryUrl) - detailurl = self.getTreeIndex(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') + detailurl = self.getTreeElement(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') if detailurl: break if detailurl == "": @@ -70,11 +70,11 @@ class wwwGetchu(Parser): return super().getDirector(htmltree) def getTags(self, htmltree): - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) def getOutline(self, htmltree): outline = '' - _list = self.getAll(htmltree, self.expr_outline) + _list = self.getTreeAll(htmltree, self.expr_outline) for i in _list: outline = outline + i.strip() return outline @@ -116,7 +116,7 @@ class dlGetchu(wwwGetchu): else: queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number) queryTree = self.getHtmlTree(queryUrl) - detailurl = self.getTreeIndex(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href') + detailurl = self.getTreeElement(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href') if detailurl == "": return None self.number = re.findall('\d+', detailurl)[0] diff --git a/scrapinglib/jav321.py b/scrapinglib/jav321.py index b202aa6..c0d1b23 100644 --- a/scrapinglib/jav321.py +++ b/scrapinglib/jav321.py @@ -74,10 +74,10 @@ class Jav321(Parser): return self.parseElement(super().getActors(htmltree)) def getLabel(self, htmltree): - return self.parseElement(self.getAll(htmltree, self.expr_label)) + return self.parseElement(self.getTreeAll(htmltree, self.expr_label)) def getTags(self, htmltree): - return self.parseElement(self.getAll(htmltree, self.expr_tags)) + return self.parseElement(self.getTreeAll(htmltree, self.expr_tags)) def getStudio(self, htmltree): - return self.parseElement(self.getAll(htmltree, self.expr_studio)) + return self.parseElement(self.getTreeAll(htmltree, self.expr_studio)) diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index 2646c72..1438c9b 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -78,9 +78,9 @@ class Javbus(Parser): def getStudio(self, htmltree): if self.uncensored: - return self.getTreeIndex(htmltree, self.expr_studio2) + return self.getTreeElement(htmltree, self.expr_studio2) else: - return self.getTreeIndex(htmltree, self.expr_studio) + return self.getTreeElement(htmltree, self.expr_studio) def getCover(self, htmltree): return urljoin("https://www.javbus.com", super().getCover(htmltree)) @@ -111,15 +111,15 @@ class Javbus(Parser): def getDirector(self, htmltree): if self.uncensored: - return self.getTreeIndex(htmltree, self.expr_directorJa) + return self.getTreeElement(htmltree, self.expr_directorJa) else: - return self.getTreeIndex(htmltree, self.expr_director) + return self.getTreeElement(htmltree, self.expr_director) def getSeries(self, htmltree): if self.uncensored: - return self.getTreeIndex(htmltree, self.expr_series2) + return self.getTreeElement(htmltree, self.expr_series2) else: - return self.getTreeIndex(htmltree, self.expr_series) + return self.getTreeElement(htmltree, self.expr_series) def getTags(self, htmltree): tags = super().getTags(htmltree).split(',') diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index 839c166..178adb2 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -86,12 +86,12 @@ class Javdb(Parser): # javdb sometime returns multiple results, # and the first elememt maybe not the one we are looking for # iterate all candidates and find the match one - urls = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href') + urls = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href') # 记录一下欧美的ids ['Blacked','Blacked'] if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): correct_url = urls[0] else: - ids = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()') + ids = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()') try: self.queryid = ids.index(number) correct_url = urls[self.queryid] @@ -105,8 +105,8 @@ class Javdb(Parser): def getNum(self, htmltree): if self.noauth: return self.number - result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']") - result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']") + result1 = str(self.getTreeAll(htmltree, self.expr_number)).strip(" ['']") + result2 = str(self.getTreeAll(htmltree, self.expr_number2)).strip(" ['']") dp_number = str(result2 + result1).strip('+') # NOTE 检测匹配与更新 self.number if dp_number.upper() != self.number.upper(): @@ -116,50 +116,50 @@ class Javdb(Parser): def getTitle(self, htmltree): if self.noauth: - return self.getTreeIndex(htmltree, self.expr_title_no, self.queryid) + return self.getTreeElement(htmltree, self.expr_title_no, self.queryid) browser_title = super().getTitle(htmltree) title = browser_title[:browser_title.find(' | JavDB')].strip() return title.replace(self.number, '').strip() def getCover(self, htmltree): if self.noauth: - return self.getTreeIndex(htmltree, self.expr_cover_no, self.queryid) + return self.getTreeElement(htmltree, self.expr_cover_no, self.queryid) return super().getCover(htmltree) def getRelease(self, htmltree): if self.noauth: - return self.getTreeIndex(htmltree, self.expr_release_no, self.queryid).strip() + return self.getTreeElement(htmltree, self.expr_release_no, self.queryid).strip() return super().getRelease(htmltree) def getRuntime(self, htmltree): - result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']") - result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']") + result1 = str(self.getTreeAll(htmltree, self.expr_runtime)).strip(" ['']") + result2 = str(self.getTreeAll(htmltree, self.expr_runtime2)).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') def getDirector(self, htmltree): - result1 = str(self.getAll(htmltree, self.expr_director)).strip(" ['']") - result2 = str(self.getAll(htmltree, self.expr_director2)).strip(" ['']") + result1 = str(self.getTreeAll(htmltree, self.expr_director)).strip(" ['']") + result2 = str(self.getTreeAll(htmltree, self.expr_director2)).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def getSeries(self, htmltree): - result1 = str(self.getAll(htmltree, self.expr_series)).strip(" ['']") - result2 = str(self.getAll(htmltree, self.expr_series2)).strip(" ['']") + result1 = str(self.getTreeAll(htmltree, self.expr_series)).strip(" ['']") + result2 = str(self.getTreeAll(htmltree, self.expr_series2)).strip(" ['']") result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '') if not result and self.fixstudio: result = self.getStudio(htmltree) return result def getLabel(self, htmltree): - result1 = str(self.getAll(htmltree, self.expr_label)).strip(" ['']") - result2 = str(self.getAll(htmltree, self.expr_label2)).strip(" ['']") + result1 = str(self.getTreeAll(htmltree, self.expr_label)).strip(" ['']") + result2 = str(self.getTreeAll(htmltree, self.expr_label2)).strip(" ['']") result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '') if not result and self.fixstudio: result = self.getStudio(htmltree) return result def getActors(self, htmltree): - actors = self.getAll(htmltree, self.expr_actor) - genders = self.getAll(htmltree, self.expr_actor2) + actors = self.getTreeAll(htmltree, self.expr_actor) + genders = self.getTreeAll(htmltree, self.expr_actor2) r = [] idx = 0 # NOTE only female, we dont care others @@ -184,11 +184,11 @@ class Javdb(Parser): def getStudio(self, htmltree): try: - return self.getAll(htmltree, self.expr_studio).strip(" ['']") + return self.getTreeAll(htmltree, self.expr_studio).strip(" ['']") except: pass try: - return self.getAll(htmltree, self.expr_studio2).strip(" ['']") + return self.getTreeAll(htmltree, self.expr_studio2).strip(" ['']") except: return '' @@ -207,17 +207,17 @@ class Javdb(Parser): def getTags(self, htmltree): try: - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) except: pass try: - return self.getAll(htmltree, self.expr_tags2) + return self.getTreeAll(htmltree, self.expr_tags2) except: return '' def getUserRating(self, htmltree): try: - result = str(self.getTreeIndex(htmltree, self.expr_userrating)) + result = str(self.getTreeElement(htmltree, self.expr_userrating)) v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) return float(v[0][0]) except: @@ -225,7 +225,7 @@ class Javdb(Parser): def getUserVotes(self, htmltree): try: - result = str(self.getTreeIndex(htmltree, self.expr_uservotes)) + result = str(self.getTreeElement(htmltree, self.expr_uservotes)) v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) return int(v[0][1]) except: @@ -237,7 +237,7 @@ class Javdb(Parser): return img_url[0] if img_url else '' def getActorPhoto(self, htmltree): - actorall = self.getAll(htmltree, self.expr_actorphoto) + actorall = self.getTreeAll(htmltree, self.expr_actorphoto) if not actorall: return {} actors = self.getActors(htmltree) diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index d300b4a..d15a2ee 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -22,7 +22,7 @@ class Madou(Parser): if self.htmlcode == 404: return 404 htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) - self.detailurl = self.getTreeIndex(htmltree, self.expr_url) + self.detailurl = self.getTreeElement(htmltree, self.expr_url) result = self.dictformat(htmltree) return result diff --git a/scrapinglib/mgstage.py b/scrapinglib/mgstage.py index c4c229e..7e41b3f 100644 --- a/scrapinglib/mgstage.py +++ b/scrapinglib/mgstage.py @@ -49,8 +49,8 @@ class Mgstage(Parser): return super().getCover(self.htmlcodetree) def getTags(self, htmltree): - result1 = str(self.getAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n') - result2 = str(self.getAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n') + result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n') + result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n') result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') return result @@ -65,7 +65,7 @@ class Mgstage(Parser): return extrafanart_imgs return '' - def getTreeIndex(self, tree, expr, index=0): + def getTreeElement(self, tree, expr, index=0): if expr == '': return '' if tree == self.detailtree: diff --git a/scrapinglib/mv91.py b/scrapinglib/mv91.py index 114aced..18814f4 100644 --- a/scrapinglib/mv91.py +++ b/scrapinglib/mv91.py @@ -62,11 +62,11 @@ class Mv91(Parser): return '91制片厂' def getTags(self, htmltree): - return self.getAll(htmltree, self.expr_tags) + return self.getTreeAll(htmltree, self.expr_tags) def getActors(self, htmltree): b=[] - for player in self.getAll(htmltree, self.expr_actor): + for player in self.getTreeAll(htmltree, self.expr_actor): player = player.replace('主演:','') if '/' in player: player = player.split('/')[0] diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index 14493d8..c052836 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -144,18 +144,18 @@ class Parser: def getNum(self, htmltree): """ 增加 strip 过滤 """ - return self.getTreeIndex(htmltree, self.expr_number) + return self.getTreeElement(htmltree, self.expr_number) def getTitle(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_title).strip() + return self.getTreeElement(htmltree, self.expr_title).strip() def getStudio(self, htmltree): try: - return self.getTreeIndex(htmltree, self.expr_studio).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_studio).strip(" ['']") except: pass try: - return self.getTreeIndex(htmltree, self.expr_studio2).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_studio2).strip(" ['']") except: return '' @@ -170,90 +170,90 @@ class Parser: def getRuntime(self, htmltree): try: - return self.getTreeIndex(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi') + return self.getTreeElement(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi') except: pass try: - return self.getTreeIndex(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi') + return self.getTreeElement(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi') except: return '' def getRelease(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_release).strip().replace('/','-') + return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-') def getOutline(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_outline).strip().replace("\n","") + return self.getTreeElement(htmltree, self.expr_outline).strip().replace("\n","") def getDirector(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_director) + return self.getTreeElement(htmltree, self.expr_director) def getActors(self, htmltree): - return self.getAll(htmltree, self.expr_actor) + return self.getTreeAll(htmltree, self.expr_actor) def getTags(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_tags) + return self.getTreeElement(htmltree, self.expr_tags) def getLabel(self, htmltree): try: - return self.getTreeIndex(htmltree, self.expr_label).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_label).strip(" ['']") except: pass try: - return self.getTreeIndex(htmltree, self.expr_label2).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_label2).strip(" ['']") except: return '' def getSeries(self, htmltree): try: - return self.getTreeIndex(htmltree, self.expr_series).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_series).strip(" ['']") except: pass try: - return self.getTreeIndex(htmltree, self.expr_series2).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_series2).strip(" ['']") except: return '' def getCover(self, htmltree): try: - return self.getTreeIndex(htmltree, self.expr_cover).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_cover).strip(" ['']") except: pass try: - return self.getTreeIndex(htmltree, self.expr_cover2).strip(" ['']") + return self.getTreeElement(htmltree, self.expr_cover2).strip(" ['']") except: return '' def getSmallCover(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_smallcover) + return self.getTreeElement(htmltree, self.expr_smallcover) def getExtrafanart(self, htmltree): - return self.getAll(htmltree, self.expr_extrafanart) + return self.getTreeAll(htmltree, self.expr_extrafanart) def getTrailer(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_trailer) + return self.getTreeElement(htmltree, self.expr_trailer) def getActorPhoto(self, htmltree): - return self.getAll(htmltree, self.expr_actorphoto) + return self.getTreeAll(htmltree, self.expr_actorphoto) def getUncensored(self, htmlree): if self.expr_uncensored: - u = self.getAll(htmlree, self.expr_uncensored) + u = self.getTreeAll(htmlree, self.expr_uncensored) return bool(u) else: return self.uncensored def getUserRating(self, htmltree): - return self.getAll(htmltree, self.expr_userrating) + return self.getTreeAll(htmltree, self.expr_userrating) def getUserVotes(self, htmltree): - return self.getAll(htmltree, self.expr_uservotes) + return self.getTreeAll(htmltree, self.expr_uservotes) - def getTreeIndex(self, tree: html.HtmlElement, expr, index=0): + def getTreeElement(self, tree: html.HtmlElement, expr, index=0): """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 """ return getTreeElement(tree, expr, index) - def getAll(self, tree: html.HtmlElement, expr): + def getTreeAll(self, tree: html.HtmlElement, expr): """ 根据表达式从`xmltree`中获取全部匹配值 """ return getTreeAll(tree, expr) diff --git a/scrapinglib/tmdb.py b/scrapinglib/tmdb.py index ccc8a36..e6adbbd 100644 --- a/scrapinglib/tmdb.py +++ b/scrapinglib/tmdb.py @@ -31,10 +31,10 @@ class Tmdb(Parser): return movieUrl def getTitle(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_title).get('content') + return self.getTreeElement(htmltree, self.expr_title).get('content') def getCover(self, htmltree): - return "https://www.themoviedb.org" + self.getTreeIndex(htmltree, self.expr_cover).get('content') + return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content') def getOutline(self, htmltree): - return self.getTreeIndex(htmltree, self.expr_outline).get('content') + return self.getTreeElement(htmltree, self.expr_outline).get('content') diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py index d787e54..24658ce 100644 --- a/scrapinglib/xcity.py +++ b/scrapinglib/xcity.py @@ -28,17 +28,17 @@ class Xcity(Parser): return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') def getRuntime(self, htmltree): - return self.getAll(htmltree, self.expr_runtime)[1].strip() + return self.getTreeAll(htmltree, self.expr_runtime)[1].strip() def getRelease(self, htmltree): try: - result = self.getTreeIndex(htmltree, self.expr_release, 1) + result = self.getTreeElement(htmltree, self.expr_release, 1) return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') except: return '' def getTags(self, htmltree): - result = self.getAll(htmltree, self.expr_tags) + result = self.getTreeAll(htmltree, self.expr_tags) total = [] for i in result: total.append(i.replace("\n","").replace("\t",""))