update scrapinglib

This commit is contained in:
Mathhew
2022-06-13 10:00:41 +08:00
parent 8348fa167b
commit 4074dcd366
16 changed files with 100 additions and 100 deletions

View File

@@ -69,7 +69,7 @@ class Airav(Parser):
return str(re.findall('\d{4}', release)).strip(" ['']") return str(re.findall('\d{4}', release)).strip(" ['']")
def getOutline(self, htmltree): def getOutline(self, htmltree):
return self.getAll(htmltree, self.expr_outline).replace('\n','').strip() return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree): def getRuntime(self, htmltree):
result = self.javbus.get('runtime') result = self.javbus.get('runtime')

View File

@@ -23,19 +23,19 @@ class Avsox(Parser):
def queryNumberUrl(self, number): def queryNumberUrl(self, number):
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox') qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
site = self.getTreeIndex(qurySiteTree, '//div[@class="container"]/div/a/@href') site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None': if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_')) self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None': if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', '')) self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
return "https:" + result1 return "https:" + result1
def getNum(self, htmltree): def getNum(self, htmltree):
new_number = self.getTreeIndex(htmltree, self.expr_number) new_number = self.getTreeElement(htmltree, self.expr_number)
if new_number.upper() != self.number.upper(): if new_number.upper() != self.number.upper():
raise ValueError('number not found in ' + self.source) raise ValueError('number not found in ' + self.source)
self.number = new_number self.number = new_number
@@ -50,7 +50,7 @@ class Avsox(Parser):
def getSmallCover(self, htmltree): def getSmallCover(self, htmltree):
""" 使用搜索页面的预览小图 """ 使用搜索页面的预览小图
""" """
return self.getTreeIndex(self.searchtree, self.expr_smallcover) return self.getTreeElement(self.searchtree, self.expr_smallcover)
def getTags(self, htmltree): def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',') tags = super().getTags(htmltree).split(',')

View File

@@ -48,11 +48,11 @@ class Carib(Parser):
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg' return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree): def getExtrafanart(self, htmltree):
r = [] r = []
genres = self.getAll(htmltree, self.expr_extrafanart) genres = self.getTreeAll(htmltree, self.expr_extrafanart)
for g in genres: for g in genres:
jpg = str(g) jpg = str(g)
if '/member/' in jpg: if '/member/' in jpg:

View File

@@ -36,22 +36,22 @@ class Dlsite(Parser):
else: else:
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie' self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
htmltree = self.getHtmlTree(self.detailurl) htmltree = self.getHtmlTree(self.detailurl)
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0: if len(search_result) == 0:
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","") number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0: if len(search_result) == 0:
if "" in number: if "" in number:
number = number.replace("","") number = number.replace("","")
elif "" in number: elif "" in number:
number = number.replace("","") number = number.replace("","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0: if len(search_result) == 0:
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '') number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
self.detailurl = search_result[0] self.detailurl = search_result[0]
htmltree = self.getHtmlTree(self.detailurl) htmltree = self.getHtmlTree(self.detailurl)
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
@@ -71,7 +71,7 @@ class Dlsite(Parser):
def getOutline(self, htmltree): def getOutline(self, htmltree):
total = [] total = []
result = self.getAll(htmltree, self.expr_outline) result = self.getTreeAll(htmltree, self.expr_outline)
for i in result: for i in result:
total.append(i.strip('\r\n')) total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
@@ -83,12 +83,12 @@ class Dlsite(Parser):
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg') return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree): def getExtrafanart(self, htmltree):
try: try:
result = [] result = []
for i in self.getAll(self.expr_extrafanart): for i in self.getTreeAll(self.expr_extrafanart):
result.append("https:" + i) result.append("https:" + i)
except: except:
result = '' result = ''

View File

@@ -68,9 +68,9 @@ class Fanza(Parser):
def getOutline(self, htmltree): def getOutline(self, htmltree):
try: try:
result = self.getTreeIndex(htmltree, self.expr_outline).replace("\n", "") result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
if result == '': if result == '':
result = self.getTreeIndex(htmltree, self.expr_outline2).replace("\n", "") result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
return result return result
except: except:
return '' return ''
@@ -98,13 +98,13 @@ class Fanza(Parser):
# return super().getCover(htmltree) # return super().getCover(htmltree)
cover_number = self.fanza_hinban cover_number = self.fanza_hinban
try: try:
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
except: except:
# sometimes fanza modify _ to \u0005f for image id # sometimes fanza modify _ to \u0005f for image id
if "_" in cover_number: if "_" in cover_number:
cover_number = cover_number.replace("_", r"\u005f") cover_number = cover_number.replace("_", r"\u005f")
try: try:
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
except: except:
# (TODO) handle more edge case # (TODO) handle more edge case
# print(html) # print(html)

View File

@@ -32,7 +32,7 @@ class Gcolle(Parser):
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
htmltree = etree.HTML(htmlcode) htmltree = etree.HTML(htmlcode)
r18url = self.getTreeIndex(htmltree, self.expr_r18) r18url = self.getTreeElement(htmltree, self.expr_r18)
if r18url and r18url.startswith('http'): if r18url and r18url.startswith('http'):
htmlcode = session.get(r18url).text htmlcode = session.get(r18url).text
htmltree = etree.HTML(htmlcode) htmltree = etree.HTML(htmlcode)
@@ -46,7 +46,7 @@ class Gcolle(Parser):
return "GCOLLE-" + str(num) return "GCOLLE-" + str(num)
def getOutline(self, htmltree): def getOutline(self, htmltree):
result = self.getAll(htmltree, self.expr_outline) result = self.getTreeAll(htmltree, self.expr_outline)
try: try:
return "\n".join(result) return "\n".join(result)
except: except:
@@ -59,12 +59,12 @@ class Gcolle(Parser):
return "https:" + super().getCover(htmltree) return "https:" + super().getCover(htmltree)
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree): def getExtrafanart(self, htmltree):
extrafanart = self.getAll(htmltree, self.expr_extrafanart) extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart)
if len(extrafanart) == 0: if len(extrafanart) == 0:
extrafanart = self.getAll(htmltree, self.expr_extrafanart2) extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart2)
# Add "https:" in each extrafanart url # Add "https:" in each extrafanart url
for i in range(len(extrafanart)): for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i] extrafanart[i] = 'https:' + extrafanart[i]

View File

@@ -53,7 +53,7 @@ class wwwGetchu(Parser):
retry = 2 retry = 2
for i in range(retry): for i in range(retry):
queryTree = self.getHtmlTree(queryUrl) queryTree = self.getHtmlTree(queryUrl)
detailurl = self.getTreeIndex(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') detailurl = self.getTreeElement(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if detailurl: if detailurl:
break break
if detailurl == "": if detailurl == "":
@@ -70,11 +70,11 @@ class wwwGetchu(Parser):
return super().getDirector(htmltree) return super().getDirector(htmltree)
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
def getOutline(self, htmltree): def getOutline(self, htmltree):
outline = '' outline = ''
_list = self.getAll(htmltree, self.expr_outline) _list = self.getTreeAll(htmltree, self.expr_outline)
for i in _list: for i in _list:
outline = outline + i.strip() outline = outline + i.strip()
return outline return outline
@@ -116,7 +116,7 @@ class dlGetchu(wwwGetchu):
else: else:
queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number) queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
queryTree = self.getHtmlTree(queryUrl) queryTree = self.getHtmlTree(queryUrl)
detailurl = self.getTreeIndex(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href') detailurl = self.getTreeElement(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
if detailurl == "": if detailurl == "":
return None return None
self.number = re.findall('\d+', detailurl)[0] self.number = re.findall('\d+', detailurl)[0]

View File

@@ -74,10 +74,10 @@ class Jav321(Parser):
return self.parseElement(super().getActors(htmltree)) return self.parseElement(super().getActors(htmltree))
def getLabel(self, htmltree): def getLabel(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_label)) return self.parseElement(self.getTreeAll(htmltree, self.expr_label))
def getTags(self, htmltree): def getTags(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_tags)) return self.parseElement(self.getTreeAll(htmltree, self.expr_tags))
def getStudio(self, htmltree): def getStudio(self, htmltree):
return self.parseElement(self.getAll(htmltree, self.expr_studio)) return self.parseElement(self.getTreeAll(htmltree, self.expr_studio))

View File

@@ -78,9 +78,9 @@ class Javbus(Parser):
def getStudio(self, htmltree): def getStudio(self, htmltree):
if self.uncensored: if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_studio2) return self.getTreeElement(htmltree, self.expr_studio2)
else: else:
return self.getTreeIndex(htmltree, self.expr_studio) return self.getTreeElement(htmltree, self.expr_studio)
def getCover(self, htmltree): def getCover(self, htmltree):
return urljoin("https://www.javbus.com", super().getCover(htmltree)) return urljoin("https://www.javbus.com", super().getCover(htmltree))
@@ -111,15 +111,15 @@ class Javbus(Parser):
def getDirector(self, htmltree): def getDirector(self, htmltree):
if self.uncensored: if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_directorJa) return self.getTreeElement(htmltree, self.expr_directorJa)
else: else:
return self.getTreeIndex(htmltree, self.expr_director) return self.getTreeElement(htmltree, self.expr_director)
def getSeries(self, htmltree): def getSeries(self, htmltree):
if self.uncensored: if self.uncensored:
return self.getTreeIndex(htmltree, self.expr_series2) return self.getTreeElement(htmltree, self.expr_series2)
else: else:
return self.getTreeIndex(htmltree, self.expr_series) return self.getTreeElement(htmltree, self.expr_series)
def getTags(self, htmltree): def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',') tags = super().getTags(htmltree).split(',')

View File

@@ -86,12 +86,12 @@ class Javdb(Parser):
# javdb sometime returns multiple results, # javdb sometime returns multiple results,
# and the first elememt maybe not the one we are looking for # and the first elememt maybe not the one we are looking for
# iterate all candidates and find the match one # iterate all candidates and find the match one
urls = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href') urls = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
# 记录一下欧美的ids ['Blacked','Blacked'] # 记录一下欧美的ids ['Blacked','Blacked']
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
correct_url = urls[0] correct_url = urls[0]
else: else:
ids = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()') ids = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
try: try:
self.queryid = ids.index(number) self.queryid = ids.index(number)
correct_url = urls[self.queryid] correct_url = urls[self.queryid]
@@ -105,8 +105,8 @@ class Javdb(Parser):
def getNum(self, htmltree): def getNum(self, htmltree):
if self.noauth: if self.noauth:
return self.number return self.number
result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']") result1 = str(self.getTreeAll(htmltree, self.expr_number)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']") result2 = str(self.getTreeAll(htmltree, self.expr_number2)).strip(" ['']")
dp_number = str(result2 + result1).strip('+') dp_number = str(result2 + result1).strip('+')
# NOTE 检测匹配与更新 self.number # NOTE 检测匹配与更新 self.number
if dp_number.upper() != self.number.upper(): if dp_number.upper() != self.number.upper():
@@ -116,50 +116,50 @@ class Javdb(Parser):
def getTitle(self, htmltree): def getTitle(self, htmltree):
if self.noauth: if self.noauth:
return self.getTreeIndex(htmltree, self.expr_title_no, self.queryid) return self.getTreeElement(htmltree, self.expr_title_no, self.queryid)
browser_title = super().getTitle(htmltree) browser_title = super().getTitle(htmltree)
title = browser_title[:browser_title.find(' | JavDB')].strip() title = browser_title[:browser_title.find(' | JavDB')].strip()
return title.replace(self.number, '').strip() return title.replace(self.number, '').strip()
def getCover(self, htmltree): def getCover(self, htmltree):
if self.noauth: if self.noauth:
return self.getTreeIndex(htmltree, self.expr_cover_no, self.queryid) return self.getTreeElement(htmltree, self.expr_cover_no, self.queryid)
return super().getCover(htmltree) return super().getCover(htmltree)
def getRelease(self, htmltree): def getRelease(self, htmltree):
if self.noauth: if self.noauth:
return self.getTreeIndex(htmltree, self.expr_release_no, self.queryid).strip() return self.getTreeElement(htmltree, self.expr_release_no, self.queryid).strip()
return super().getRelease(htmltree) return super().getRelease(htmltree)
def getRuntime(self, htmltree): def getRuntime(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']") result1 = str(self.getTreeAll(htmltree, self.expr_runtime)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']") result2 = str(self.getTreeAll(htmltree, self.expr_runtime2)).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi') return str(result1 + result2).strip('+').rstrip('mi')
def getDirector(self, htmltree): def getDirector(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_director)).strip(" ['']") result1 = str(self.getTreeAll(htmltree, self.expr_director)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_director2)).strip(" ['']") result2 = str(self.getTreeAll(htmltree, self.expr_director2)).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getSeries(self, htmltree): def getSeries(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_series)).strip(" ['']") result1 = str(self.getTreeAll(htmltree, self.expr_series)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_series2)).strip(" ['']") result2 = str(self.getTreeAll(htmltree, self.expr_series2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '') result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio: if not result and self.fixstudio:
result = self.getStudio(htmltree) result = self.getStudio(htmltree)
return result return result
def getLabel(self, htmltree): def getLabel(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_label)).strip(" ['']") result1 = str(self.getTreeAll(htmltree, self.expr_label)).strip(" ['']")
result2 = str(self.getAll(htmltree, self.expr_label2)).strip(" ['']") result2 = str(self.getTreeAll(htmltree, self.expr_label2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '') result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio: if not result and self.fixstudio:
result = self.getStudio(htmltree) result = self.getStudio(htmltree)
return result return result
def getActors(self, htmltree): def getActors(self, htmltree):
actors = self.getAll(htmltree, self.expr_actor) actors = self.getTreeAll(htmltree, self.expr_actor)
genders = self.getAll(htmltree, self.expr_actor2) genders = self.getTreeAll(htmltree, self.expr_actor2)
r = [] r = []
idx = 0 idx = 0
# NOTE only female, we dont care others # NOTE only female, we dont care others
@@ -184,11 +184,11 @@ class Javdb(Parser):
def getStudio(self, htmltree): def getStudio(self, htmltree):
try: try:
return self.getAll(htmltree, self.expr_studio).strip(" ['']") return self.getTreeAll(htmltree, self.expr_studio).strip(" ['']")
except: except:
pass pass
try: try:
return self.getAll(htmltree, self.expr_studio2).strip(" ['']") return self.getTreeAll(htmltree, self.expr_studio2).strip(" ['']")
except: except:
return '' return ''
@@ -207,17 +207,17 @@ class Javdb(Parser):
def getTags(self, htmltree): def getTags(self, htmltree):
try: try:
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
except: except:
pass pass
try: try:
return self.getAll(htmltree, self.expr_tags2) return self.getTreeAll(htmltree, self.expr_tags2)
except: except:
return '' return ''
def getUserRating(self, htmltree): def getUserRating(self, htmltree):
try: try:
result = str(self.getTreeIndex(htmltree, self.expr_userrating)) result = str(self.getTreeElement(htmltree, self.expr_userrating))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]) return float(v[0][0])
except: except:
@@ -225,7 +225,7 @@ class Javdb(Parser):
def getUserVotes(self, htmltree): def getUserVotes(self, htmltree):
try: try:
result = str(self.getTreeIndex(htmltree, self.expr_uservotes)) result = str(self.getTreeElement(htmltree, self.expr_uservotes))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return int(v[0][1]) return int(v[0][1])
except: except:
@@ -237,7 +237,7 @@ class Javdb(Parser):
return img_url[0] if img_url else '' return img_url[0] if img_url else ''
def getActorPhoto(self, htmltree): def getActorPhoto(self, htmltree):
actorall = self.getAll(htmltree, self.expr_actorphoto) actorall = self.getTreeAll(htmltree, self.expr_actorphoto)
if not actorall: if not actorall:
return {} return {}
actors = self.getActors(htmltree) actors = self.getActors(htmltree)

View File

@@ -22,7 +22,7 @@ class Madou(Parser):
if self.htmlcode == 404: if self.htmlcode == 404:
return 404 return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
self.detailurl = self.getTreeIndex(htmltree, self.expr_url) self.detailurl = self.getTreeElement(htmltree, self.expr_url)
result = self.dictformat(htmltree) result = self.dictformat(htmltree)
return result return result

View File

@@ -49,8 +49,8 @@ class Mgstage(Parser):
return super().getCover(self.htmlcodetree) return super().getCover(self.htmlcodetree)
def getTags(self, htmltree): def getTags(self, htmltree):
result1 = str(self.getAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n') result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(self.getAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n') result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',') result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result return result
@@ -65,7 +65,7 @@ class Mgstage(Parser):
return extrafanart_imgs return extrafanart_imgs
return '' return ''
def getTreeIndex(self, tree, expr, index=0): def getTreeElement(self, tree, expr, index=0):
if expr == '': if expr == '':
return '' return ''
if tree == self.detailtree: if tree == self.detailtree:

View File

@@ -62,11 +62,11 @@ class Mv91(Parser):
return '91制片厂' return '91制片厂'
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getAll(htmltree, self.expr_tags) return self.getTreeAll(htmltree, self.expr_tags)
def getActors(self, htmltree): def getActors(self, htmltree):
b=[] b=[]
for player in self.getAll(htmltree, self.expr_actor): for player in self.getTreeAll(htmltree, self.expr_actor):
player = player.replace('主演:','') player = player.replace('主演:','')
if '/' in player: if '/' in player:
player = player.split('/')[0] player = player.split('/')[0]

View File

@@ -144,18 +144,18 @@ class Parser:
def getNum(self, htmltree): def getNum(self, htmltree):
""" 增加 strip 过滤 """ 增加 strip 过滤
""" """
return self.getTreeIndex(htmltree, self.expr_number) return self.getTreeElement(htmltree, self.expr_number)
def getTitle(self, htmltree): def getTitle(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_title).strip() return self.getTreeElement(htmltree, self.expr_title).strip()
def getStudio(self, htmltree): def getStudio(self, htmltree):
try: try:
return self.getTreeIndex(htmltree, self.expr_studio).strip(" ['']") return self.getTreeElement(htmltree, self.expr_studio).strip(" ['']")
except: except:
pass pass
try: try:
return self.getTreeIndex(htmltree, self.expr_studio2).strip(" ['']") return self.getTreeElement(htmltree, self.expr_studio2).strip(" ['']")
except: except:
return '' return ''
@@ -170,90 +170,90 @@ class Parser:
def getRuntime(self, htmltree): def getRuntime(self, htmltree):
try: try:
return self.getTreeIndex(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi') return self.getTreeElement(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
except: except:
pass pass
try: try:
return self.getTreeIndex(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi') return self.getTreeElement(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
except: except:
return '' return ''
def getRelease(self, htmltree): def getRelease(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_release).strip().replace('/','-') return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-')
def getOutline(self, htmltree): def getOutline(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_outline).strip().replace("\n","") return self.getTreeElement(htmltree, self.expr_outline).strip().replace("\n","")
def getDirector(self, htmltree): def getDirector(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_director) return self.getTreeElement(htmltree, self.expr_director)
def getActors(self, htmltree): def getActors(self, htmltree):
return self.getAll(htmltree, self.expr_actor) return self.getTreeAll(htmltree, self.expr_actor)
def getTags(self, htmltree): def getTags(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_tags) return self.getTreeElement(htmltree, self.expr_tags)
def getLabel(self, htmltree): def getLabel(self, htmltree):
try: try:
return self.getTreeIndex(htmltree, self.expr_label).strip(" ['']") return self.getTreeElement(htmltree, self.expr_label).strip(" ['']")
except: except:
pass pass
try: try:
return self.getTreeIndex(htmltree, self.expr_label2).strip(" ['']") return self.getTreeElement(htmltree, self.expr_label2).strip(" ['']")
except: except:
return '' return ''
def getSeries(self, htmltree): def getSeries(self, htmltree):
try: try:
return self.getTreeIndex(htmltree, self.expr_series).strip(" ['']") return self.getTreeElement(htmltree, self.expr_series).strip(" ['']")
except: except:
pass pass
try: try:
return self.getTreeIndex(htmltree, self.expr_series2).strip(" ['']") return self.getTreeElement(htmltree, self.expr_series2).strip(" ['']")
except: except:
return '' return ''
def getCover(self, htmltree): def getCover(self, htmltree):
try: try:
return self.getTreeIndex(htmltree, self.expr_cover).strip(" ['']") return self.getTreeElement(htmltree, self.expr_cover).strip(" ['']")
except: except:
pass pass
try: try:
return self.getTreeIndex(htmltree, self.expr_cover2).strip(" ['']") return self.getTreeElement(htmltree, self.expr_cover2).strip(" ['']")
except: except:
return '' return ''
def getSmallCover(self, htmltree): def getSmallCover(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_smallcover) return self.getTreeElement(htmltree, self.expr_smallcover)
def getExtrafanart(self, htmltree): def getExtrafanart(self, htmltree):
return self.getAll(htmltree, self.expr_extrafanart) return self.getTreeAll(htmltree, self.expr_extrafanart)
def getTrailer(self, htmltree): def getTrailer(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_trailer) return self.getTreeElement(htmltree, self.expr_trailer)
def getActorPhoto(self, htmltree): def getActorPhoto(self, htmltree):
return self.getAll(htmltree, self.expr_actorphoto) return self.getTreeAll(htmltree, self.expr_actorphoto)
def getUncensored(self, htmlree): def getUncensored(self, htmlree):
if self.expr_uncensored: if self.expr_uncensored:
u = self.getAll(htmlree, self.expr_uncensored) u = self.getTreeAll(htmlree, self.expr_uncensored)
return bool(u) return bool(u)
else: else:
return self.uncensored return self.uncensored
def getUserRating(self, htmltree): def getUserRating(self, htmltree):
return self.getAll(htmltree, self.expr_userrating) return self.getTreeAll(htmltree, self.expr_userrating)
def getUserVotes(self, htmltree): def getUserVotes(self, htmltree):
return self.getAll(htmltree, self.expr_uservotes) return self.getTreeAll(htmltree, self.expr_uservotes)
def getTreeIndex(self, tree: html.HtmlElement, expr, index=0): def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
""" """
return getTreeElement(tree, expr, index) return getTreeElement(tree, expr, index)
def getAll(self, tree: html.HtmlElement, expr): def getTreeAll(self, tree: html.HtmlElement, expr):
""" 根据表达式从`xmltree`中获取全部匹配值 """ 根据表达式从`xmltree`中获取全部匹配值
""" """
return getTreeAll(tree, expr) return getTreeAll(tree, expr)

View File

@@ -31,10 +31,10 @@ class Tmdb(Parser):
return movieUrl return movieUrl
def getTitle(self, htmltree): def getTitle(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_title).get('content') return self.getTreeElement(htmltree, self.expr_title).get('content')
def getCover(self, htmltree): def getCover(self, htmltree):
return "https://www.themoviedb.org" + self.getTreeIndex(htmltree, self.expr_cover).get('content') return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content')
def getOutline(self, htmltree): def getOutline(self, htmltree):
return self.getTreeIndex(htmltree, self.expr_outline).get('content') return self.getTreeElement(htmltree, self.expr_outline).get('content')

View File

@@ -28,17 +28,17 @@ class Xcity(Parser):
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
def getRuntime(self, htmltree): def getRuntime(self, htmltree):
return self.getAll(htmltree, self.expr_runtime)[1].strip() return self.getTreeAll(htmltree, self.expr_runtime)[1].strip()
def getRelease(self, htmltree): def getRelease(self, htmltree):
try: try:
result = self.getTreeIndex(htmltree, self.expr_release, 1) result = self.getTreeElement(htmltree, self.expr_release, 1)
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-') return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
except: except:
return '' return ''
def getTags(self, htmltree): def getTags(self, htmltree):
result = self.getAll(htmltree, self.expr_tags) result = self.getTreeAll(htmltree, self.expr_tags)
total = [] total = []
for i in result: for i in result:
total.append(i.replace("\n","").replace("\t","")) total.append(i.replace("\n","").replace("\t",""))