update scrapinglib
This commit is contained in:
@@ -69,7 +69,7 @@ class Airav(Parser):
|
|||||||
return str(re.findall('\d{4}', release)).strip(" ['']")
|
return str(re.findall('\d{4}', release)).strip(" ['']")
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_outline).replace('\n','').strip()
|
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
|
||||||
|
|
||||||
def getRuntime(self, htmltree):
|
def getRuntime(self, htmltree):
|
||||||
result = self.javbus.get('runtime')
|
result = self.javbus.get('runtime')
|
||||||
|
|||||||
@@ -23,19 +23,19 @@ class Avsox(Parser):
|
|||||||
|
|
||||||
def queryNumberUrl(self, number):
|
def queryNumberUrl(self, number):
|
||||||
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
|
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
|
||||||
site = self.getTreeIndex(qurySiteTree, '//div[@class="container"]/div/a/@href')
|
site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
|
||||||
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
|
||||||
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
if result1 == '' or result1 == 'null' or result1 == 'None':
|
if result1 == '' or result1 == 'null' or result1 == 'None':
|
||||||
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
|
||||||
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
if result1 == '' or result1 == 'null' or result1 == 'None':
|
if result1 == '' or result1 == 'null' or result1 == 'None':
|
||||||
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
|
||||||
result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
return "https:" + result1
|
return "https:" + result1
|
||||||
|
|
||||||
def getNum(self, htmltree):
|
def getNum(self, htmltree):
|
||||||
new_number = self.getTreeIndex(htmltree, self.expr_number)
|
new_number = self.getTreeElement(htmltree, self.expr_number)
|
||||||
if new_number.upper() != self.number.upper():
|
if new_number.upper() != self.number.upper():
|
||||||
raise ValueError('number not found in ' + self.source)
|
raise ValueError('number not found in ' + self.source)
|
||||||
self.number = new_number
|
self.number = new_number
|
||||||
@@ -50,7 +50,7 @@ class Avsox(Parser):
|
|||||||
def getSmallCover(self, htmltree):
|
def getSmallCover(self, htmltree):
|
||||||
""" 使用搜索页面的预览小图
|
""" 使用搜索页面的预览小图
|
||||||
"""
|
"""
|
||||||
return self.getTreeIndex(self.searchtree, self.expr_smallcover)
|
return self.getTreeElement(self.searchtree, self.expr_smallcover)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
tags = super().getTags(htmltree).split(',')
|
tags = super().getTags(htmltree).split(',')
|
||||||
|
|||||||
@@ -48,11 +48,11 @@ class Carib(Parser):
|
|||||||
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
|
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
def getExtrafanart(self, htmltree):
|
||||||
r = []
|
r = []
|
||||||
genres = self.getAll(htmltree, self.expr_extrafanart)
|
genres = self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
for g in genres:
|
for g in genres:
|
||||||
jpg = str(g)
|
jpg = str(g)
|
||||||
if '/member/' in jpg:
|
if '/member/' in jpg:
|
||||||
|
|||||||
@@ -36,22 +36,22 @@ class Dlsite(Parser):
|
|||||||
else:
|
else:
|
||||||
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
|
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
|
||||||
htmltree = self.getHtmlTree(self.detailurl)
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
if len(search_result) == 0:
|
if len(search_result) == 0:
|
||||||
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
|
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
|
||||||
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
if len(search_result) == 0:
|
if len(search_result) == 0:
|
||||||
if "~" in number:
|
if "~" in number:
|
||||||
number = number.replace("~","〜")
|
number = number.replace("~","〜")
|
||||||
elif "〜" in number:
|
elif "〜" in number:
|
||||||
number = number.replace("〜","~")
|
number = number.replace("〜","~")
|
||||||
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
if len(search_result) == 0:
|
if len(search_result) == 0:
|
||||||
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
|
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
|
||||||
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
self.detailurl = search_result[0]
|
self.detailurl = search_result[0]
|
||||||
htmltree = self.getHtmlTree(self.detailurl)
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
|
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
|
||||||
@@ -71,7 +71,7 @@ class Dlsite(Parser):
|
|||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
total = []
|
total = []
|
||||||
result = self.getAll(htmltree, self.expr_outline)
|
result = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
for i in result:
|
for i in result:
|
||||||
total.append(i.strip('\r\n'))
|
total.append(i.strip('\r\n'))
|
||||||
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
|
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
|
||||||
@@ -83,12 +83,12 @@ class Dlsite(Parser):
|
|||||||
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
|
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
def getExtrafanart(self, htmltree):
|
||||||
try:
|
try:
|
||||||
result = []
|
result = []
|
||||||
for i in self.getAll(self.expr_extrafanart):
|
for i in self.getTreeAll(self.expr_extrafanart):
|
||||||
result.append("https:" + i)
|
result.append("https:" + i)
|
||||||
except:
|
except:
|
||||||
result = ''
|
result = ''
|
||||||
|
|||||||
@@ -68,9 +68,9 @@ class Fanza(Parser):
|
|||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
try:
|
try:
|
||||||
result = self.getTreeIndex(htmltree, self.expr_outline).replace("\n", "")
|
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
|
||||||
if result == '':
|
if result == '':
|
||||||
result = self.getTreeIndex(htmltree, self.expr_outline2).replace("\n", "")
|
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
||||||
return result
|
return result
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
@@ -98,13 +98,13 @@ class Fanza(Parser):
|
|||||||
# return super().getCover(htmltree)
|
# return super().getCover(htmltree)
|
||||||
cover_number = self.fanza_hinban
|
cover_number = self.fanza_hinban
|
||||||
try:
|
try:
|
||||||
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
||||||
except:
|
except:
|
||||||
# sometimes fanza modify _ to \u0005f for image id
|
# sometimes fanza modify _ to \u0005f for image id
|
||||||
if "_" in cover_number:
|
if "_" in cover_number:
|
||||||
cover_number = cover_number.replace("_", r"\u005f")
|
cover_number = cover_number.replace("_", r"\u005f")
|
||||||
try:
|
try:
|
||||||
result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
result = self.getTreeElement(htmltree, '//*[@id="' + cover_number + '"]/@href')
|
||||||
except:
|
except:
|
||||||
# (TODO) handle more edge case
|
# (TODO) handle more edge case
|
||||||
# print(html)
|
# print(html)
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class Gcolle(Parser):
|
|||||||
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
|
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
|
||||||
htmltree = etree.HTML(htmlcode)
|
htmltree = etree.HTML(htmlcode)
|
||||||
|
|
||||||
r18url = self.getTreeIndex(htmltree, self.expr_r18)
|
r18url = self.getTreeElement(htmltree, self.expr_r18)
|
||||||
if r18url and r18url.startswith('http'):
|
if r18url and r18url.startswith('http'):
|
||||||
htmlcode = session.get(r18url).text
|
htmlcode = session.get(r18url).text
|
||||||
htmltree = etree.HTML(htmlcode)
|
htmltree = etree.HTML(htmlcode)
|
||||||
@@ -46,7 +46,7 @@ class Gcolle(Parser):
|
|||||||
return "GCOLLE-" + str(num)
|
return "GCOLLE-" + str(num)
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
result = self.getAll(htmltree, self.expr_outline)
|
result = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
try:
|
try:
|
||||||
return "\n".join(result)
|
return "\n".join(result)
|
||||||
except:
|
except:
|
||||||
@@ -59,12 +59,12 @@ class Gcolle(Parser):
|
|||||||
return "https:" + super().getCover(htmltree)
|
return "https:" + super().getCover(htmltree)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
def getExtrafanart(self, htmltree):
|
||||||
extrafanart = self.getAll(htmltree, self.expr_extrafanart)
|
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
if len(extrafanart) == 0:
|
if len(extrafanart) == 0:
|
||||||
extrafanart = self.getAll(htmltree, self.expr_extrafanart2)
|
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart2)
|
||||||
# Add "https:" in each extrafanart url
|
# Add "https:" in each extrafanart url
|
||||||
for i in range(len(extrafanart)):
|
for i in range(len(extrafanart)):
|
||||||
extrafanart[i] = 'https:' + extrafanart[i]
|
extrafanart[i] = 'https:' + extrafanart[i]
|
||||||
|
|||||||
@@ -53,7 +53,7 @@ class wwwGetchu(Parser):
|
|||||||
retry = 2
|
retry = 2
|
||||||
for i in range(retry):
|
for i in range(retry):
|
||||||
queryTree = self.getHtmlTree(queryUrl)
|
queryTree = self.getHtmlTree(queryUrl)
|
||||||
detailurl = self.getTreeIndex(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
|
detailurl = self.getTreeElement(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
|
||||||
if detailurl:
|
if detailurl:
|
||||||
break
|
break
|
||||||
if detailurl == "":
|
if detailurl == "":
|
||||||
@@ -70,11 +70,11 @@ class wwwGetchu(Parser):
|
|||||||
return super().getDirector(htmltree)
|
return super().getDirector(htmltree)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
outline = ''
|
outline = ''
|
||||||
_list = self.getAll(htmltree, self.expr_outline)
|
_list = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
for i in _list:
|
for i in _list:
|
||||||
outline = outline + i.strip()
|
outline = outline + i.strip()
|
||||||
return outline
|
return outline
|
||||||
@@ -116,7 +116,7 @@ class dlGetchu(wwwGetchu):
|
|||||||
else:
|
else:
|
||||||
queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
|
queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
|
||||||
queryTree = self.getHtmlTree(queryUrl)
|
queryTree = self.getHtmlTree(queryUrl)
|
||||||
detailurl = self.getTreeIndex(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
|
detailurl = self.getTreeElement(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
|
||||||
if detailurl == "":
|
if detailurl == "":
|
||||||
return None
|
return None
|
||||||
self.number = re.findall('\d+', detailurl)[0]
|
self.number = re.findall('\d+', detailurl)[0]
|
||||||
|
|||||||
@@ -74,10 +74,10 @@ class Jav321(Parser):
|
|||||||
return self.parseElement(super().getActors(htmltree))
|
return self.parseElement(super().getActors(htmltree))
|
||||||
|
|
||||||
def getLabel(self, htmltree):
|
def getLabel(self, htmltree):
|
||||||
return self.parseElement(self.getAll(htmltree, self.expr_label))
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_label))
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.parseElement(self.getAll(htmltree, self.expr_tags))
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_tags))
|
||||||
|
|
||||||
def getStudio(self, htmltree):
|
def getStudio(self, htmltree):
|
||||||
return self.parseElement(self.getAll(htmltree, self.expr_studio))
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_studio))
|
||||||
|
|||||||
@@ -78,9 +78,9 @@ class Javbus(Parser):
|
|||||||
|
|
||||||
def getStudio(self, htmltree):
|
def getStudio(self, htmltree):
|
||||||
if self.uncensored:
|
if self.uncensored:
|
||||||
return self.getTreeIndex(htmltree, self.expr_studio2)
|
return self.getTreeElement(htmltree, self.expr_studio2)
|
||||||
else:
|
else:
|
||||||
return self.getTreeIndex(htmltree, self.expr_studio)
|
return self.getTreeElement(htmltree, self.expr_studio)
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
def getCover(self, htmltree):
|
||||||
return urljoin("https://www.javbus.com", super().getCover(htmltree))
|
return urljoin("https://www.javbus.com", super().getCover(htmltree))
|
||||||
@@ -111,15 +111,15 @@ class Javbus(Parser):
|
|||||||
|
|
||||||
def getDirector(self, htmltree):
|
def getDirector(self, htmltree):
|
||||||
if self.uncensored:
|
if self.uncensored:
|
||||||
return self.getTreeIndex(htmltree, self.expr_directorJa)
|
return self.getTreeElement(htmltree, self.expr_directorJa)
|
||||||
else:
|
else:
|
||||||
return self.getTreeIndex(htmltree, self.expr_director)
|
return self.getTreeElement(htmltree, self.expr_director)
|
||||||
|
|
||||||
def getSeries(self, htmltree):
|
def getSeries(self, htmltree):
|
||||||
if self.uncensored:
|
if self.uncensored:
|
||||||
return self.getTreeIndex(htmltree, self.expr_series2)
|
return self.getTreeElement(htmltree, self.expr_series2)
|
||||||
else:
|
else:
|
||||||
return self.getTreeIndex(htmltree, self.expr_series)
|
return self.getTreeElement(htmltree, self.expr_series)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
tags = super().getTags(htmltree).split(',')
|
tags = super().getTags(htmltree).split(',')
|
||||||
|
|||||||
@@ -86,12 +86,12 @@ class Javdb(Parser):
|
|||||||
# javdb sometime returns multiple results,
|
# javdb sometime returns multiple results,
|
||||||
# and the first elememt maybe not the one we are looking for
|
# and the first elememt maybe not the one we are looking for
|
||||||
# iterate all candidates and find the match one
|
# iterate all candidates and find the match one
|
||||||
urls = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
|
urls = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
|
||||||
# 记录一下欧美的ids ['Blacked','Blacked']
|
# 记录一下欧美的ids ['Blacked','Blacked']
|
||||||
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
|
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
|
||||||
correct_url = urls[0]
|
correct_url = urls[0]
|
||||||
else:
|
else:
|
||||||
ids = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
|
ids = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
|
||||||
try:
|
try:
|
||||||
self.queryid = ids.index(number)
|
self.queryid = ids.index(number)
|
||||||
correct_url = urls[self.queryid]
|
correct_url = urls[self.queryid]
|
||||||
@@ -105,8 +105,8 @@ class Javdb(Parser):
|
|||||||
def getNum(self, htmltree):
|
def getNum(self, htmltree):
|
||||||
if self.noauth:
|
if self.noauth:
|
||||||
return self.number
|
return self.number
|
||||||
result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']")
|
result1 = str(self.getTreeAll(htmltree, self.expr_number)).strip(" ['']")
|
||||||
result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']")
|
result2 = str(self.getTreeAll(htmltree, self.expr_number2)).strip(" ['']")
|
||||||
dp_number = str(result2 + result1).strip('+')
|
dp_number = str(result2 + result1).strip('+')
|
||||||
# NOTE 检测匹配与更新 self.number
|
# NOTE 检测匹配与更新 self.number
|
||||||
if dp_number.upper() != self.number.upper():
|
if dp_number.upper() != self.number.upper():
|
||||||
@@ -116,50 +116,50 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def getTitle(self, htmltree):
|
def getTitle(self, htmltree):
|
||||||
if self.noauth:
|
if self.noauth:
|
||||||
return self.getTreeIndex(htmltree, self.expr_title_no, self.queryid)
|
return self.getTreeElement(htmltree, self.expr_title_no, self.queryid)
|
||||||
browser_title = super().getTitle(htmltree)
|
browser_title = super().getTitle(htmltree)
|
||||||
title = browser_title[:browser_title.find(' | JavDB')].strip()
|
title = browser_title[:browser_title.find(' | JavDB')].strip()
|
||||||
return title.replace(self.number, '').strip()
|
return title.replace(self.number, '').strip()
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
def getCover(self, htmltree):
|
||||||
if self.noauth:
|
if self.noauth:
|
||||||
return self.getTreeIndex(htmltree, self.expr_cover_no, self.queryid)
|
return self.getTreeElement(htmltree, self.expr_cover_no, self.queryid)
|
||||||
return super().getCover(htmltree)
|
return super().getCover(htmltree)
|
||||||
|
|
||||||
def getRelease(self, htmltree):
|
def getRelease(self, htmltree):
|
||||||
if self.noauth:
|
if self.noauth:
|
||||||
return self.getTreeIndex(htmltree, self.expr_release_no, self.queryid).strip()
|
return self.getTreeElement(htmltree, self.expr_release_no, self.queryid).strip()
|
||||||
return super().getRelease(htmltree)
|
return super().getRelease(htmltree)
|
||||||
|
|
||||||
def getRuntime(self, htmltree):
|
def getRuntime(self, htmltree):
|
||||||
result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']")
|
result1 = str(self.getTreeAll(htmltree, self.expr_runtime)).strip(" ['']")
|
||||||
result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']")
|
result2 = str(self.getTreeAll(htmltree, self.expr_runtime2)).strip(" ['']")
|
||||||
return str(result1 + result2).strip('+').rstrip('mi')
|
return str(result1 + result2).strip('+').rstrip('mi')
|
||||||
|
|
||||||
def getDirector(self, htmltree):
|
def getDirector(self, htmltree):
|
||||||
result1 = str(self.getAll(htmltree, self.expr_director)).strip(" ['']")
|
result1 = str(self.getTreeAll(htmltree, self.expr_director)).strip(" ['']")
|
||||||
result2 = str(self.getAll(htmltree, self.expr_director2)).strip(" ['']")
|
result2 = str(self.getTreeAll(htmltree, self.expr_director2)).strip(" ['']")
|
||||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
|
|
||||||
def getSeries(self, htmltree):
|
def getSeries(self, htmltree):
|
||||||
result1 = str(self.getAll(htmltree, self.expr_series)).strip(" ['']")
|
result1 = str(self.getTreeAll(htmltree, self.expr_series)).strip(" ['']")
|
||||||
result2 = str(self.getAll(htmltree, self.expr_series2)).strip(" ['']")
|
result2 = str(self.getTreeAll(htmltree, self.expr_series2)).strip(" ['']")
|
||||||
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
if not result and self.fixstudio:
|
if not result and self.fixstudio:
|
||||||
result = self.getStudio(htmltree)
|
result = self.getStudio(htmltree)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def getLabel(self, htmltree):
|
def getLabel(self, htmltree):
|
||||||
result1 = str(self.getAll(htmltree, self.expr_label)).strip(" ['']")
|
result1 = str(self.getTreeAll(htmltree, self.expr_label)).strip(" ['']")
|
||||||
result2 = str(self.getAll(htmltree, self.expr_label2)).strip(" ['']")
|
result2 = str(self.getTreeAll(htmltree, self.expr_label2)).strip(" ['']")
|
||||||
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
if not result and self.fixstudio:
|
if not result and self.fixstudio:
|
||||||
result = self.getStudio(htmltree)
|
result = self.getStudio(htmltree)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def getActors(self, htmltree):
|
def getActors(self, htmltree):
|
||||||
actors = self.getAll(htmltree, self.expr_actor)
|
actors = self.getTreeAll(htmltree, self.expr_actor)
|
||||||
genders = self.getAll(htmltree, self.expr_actor2)
|
genders = self.getTreeAll(htmltree, self.expr_actor2)
|
||||||
r = []
|
r = []
|
||||||
idx = 0
|
idx = 0
|
||||||
# NOTE only female, we dont care others
|
# NOTE only female, we dont care others
|
||||||
@@ -184,11 +184,11 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def getStudio(self, htmltree):
|
def getStudio(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getAll(htmltree, self.expr_studio).strip(" ['']")
|
return self.getTreeAll(htmltree, self.expr_studio).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getAll(htmltree, self.expr_studio2).strip(" ['']")
|
return self.getTreeAll(htmltree, self.expr_studio2).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@@ -207,17 +207,17 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getAll(htmltree, self.expr_tags2)
|
return self.getTreeAll(htmltree, self.expr_tags2)
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getUserRating(self, htmltree):
|
def getUserRating(self, htmltree):
|
||||||
try:
|
try:
|
||||||
result = str(self.getTreeIndex(htmltree, self.expr_userrating))
|
result = str(self.getTreeElement(htmltree, self.expr_userrating))
|
||||||
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
||||||
return float(v[0][0])
|
return float(v[0][0])
|
||||||
except:
|
except:
|
||||||
@@ -225,7 +225,7 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def getUserVotes(self, htmltree):
|
def getUserVotes(self, htmltree):
|
||||||
try:
|
try:
|
||||||
result = str(self.getTreeIndex(htmltree, self.expr_uservotes))
|
result = str(self.getTreeElement(htmltree, self.expr_uservotes))
|
||||||
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
||||||
return int(v[0][1])
|
return int(v[0][1])
|
||||||
except:
|
except:
|
||||||
@@ -237,7 +237,7 @@ class Javdb(Parser):
|
|||||||
return img_url[0] if img_url else ''
|
return img_url[0] if img_url else ''
|
||||||
|
|
||||||
def getActorPhoto(self, htmltree):
|
def getActorPhoto(self, htmltree):
|
||||||
actorall = self.getAll(htmltree, self.expr_actorphoto)
|
actorall = self.getTreeAll(htmltree, self.expr_actorphoto)
|
||||||
if not actorall:
|
if not actorall:
|
||||||
return {}
|
return {}
|
||||||
actors = self.getActors(htmltree)
|
actors = self.getActors(htmltree)
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class Madou(Parser):
|
|||||||
if self.htmlcode == 404:
|
if self.htmlcode == 404:
|
||||||
return 404
|
return 404
|
||||||
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||||
self.detailurl = self.getTreeIndex(htmltree, self.expr_url)
|
self.detailurl = self.getTreeElement(htmltree, self.expr_url)
|
||||||
|
|
||||||
result = self.dictformat(htmltree)
|
result = self.dictformat(htmltree)
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -49,8 +49,8 @@ class Mgstage(Parser):
|
|||||||
return super().getCover(self.htmlcodetree)
|
return super().getCover(self.htmlcodetree)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
result1 = str(self.getAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
|
result1 = str(self.getTreeAll(htmltree, self.expr_tags)).strip(" ['']").strip('\\n ').strip('\\n')
|
||||||
result2 = str(self.getAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
|
result2 = str(self.getTreeAll(htmltree, self.expr_tags2)).strip(" ['']").strip('\\n ').strip('\\n')
|
||||||
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -65,7 +65,7 @@ class Mgstage(Parser):
|
|||||||
return extrafanart_imgs
|
return extrafanart_imgs
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getTreeIndex(self, tree, expr, index=0):
|
def getTreeElement(self, tree, expr, index=0):
|
||||||
if expr == '':
|
if expr == '':
|
||||||
return ''
|
return ''
|
||||||
if tree == self.detailtree:
|
if tree == self.detailtree:
|
||||||
|
|||||||
@@ -62,11 +62,11 @@ class Mv91(Parser):
|
|||||||
return '91制片厂'
|
return '91制片厂'
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_tags)
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getActors(self, htmltree):
|
def getActors(self, htmltree):
|
||||||
b=[]
|
b=[]
|
||||||
for player in self.getAll(htmltree, self.expr_actor):
|
for player in self.getTreeAll(htmltree, self.expr_actor):
|
||||||
player = player.replace('主演:','')
|
player = player.replace('主演:','')
|
||||||
if '/' in player:
|
if '/' in player:
|
||||||
player = player.split('/')[0]
|
player = player.split('/')[0]
|
||||||
|
|||||||
@@ -144,18 +144,18 @@ class Parser:
|
|||||||
def getNum(self, htmltree):
|
def getNum(self, htmltree):
|
||||||
""" 增加 strip 过滤
|
""" 增加 strip 过滤
|
||||||
"""
|
"""
|
||||||
return self.getTreeIndex(htmltree, self.expr_number)
|
return self.getTreeElement(htmltree, self.expr_number)
|
||||||
|
|
||||||
def getTitle(self, htmltree):
|
def getTitle(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_title).strip()
|
return self.getTreeElement(htmltree, self.expr_title).strip()
|
||||||
|
|
||||||
def getStudio(self, htmltree):
|
def getStudio(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_studio).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_studio).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_studio2).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_studio2).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@@ -170,90 +170,90 @@ class Parser:
|
|||||||
|
|
||||||
def getRuntime(self, htmltree):
|
def getRuntime(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
|
return self.getTreeElement(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
|
return self.getTreeElement(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getRelease(self, htmltree):
|
def getRelease(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_release).strip().replace('/','-')
|
return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-')
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_outline).strip().replace("\n","")
|
return self.getTreeElement(htmltree, self.expr_outline).strip().replace("\n","")
|
||||||
|
|
||||||
def getDirector(self, htmltree):
|
def getDirector(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_director)
|
return self.getTreeElement(htmltree, self.expr_director)
|
||||||
|
|
||||||
def getActors(self, htmltree):
|
def getActors(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_actor)
|
return self.getTreeAll(htmltree, self.expr_actor)
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_tags)
|
return self.getTreeElement(htmltree, self.expr_tags)
|
||||||
|
|
||||||
def getLabel(self, htmltree):
|
def getLabel(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_label).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_label).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_label2).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_label2).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getSeries(self, htmltree):
|
def getSeries(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_series).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_series).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_series2).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_series2).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
def getCover(self, htmltree):
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_cover).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_cover).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.getTreeIndex(htmltree, self.expr_cover2).strip(" ['']")
|
return self.getTreeElement(htmltree, self.expr_cover2).strip(" ['']")
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getSmallCover(self, htmltree):
|
def getSmallCover(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_smallcover)
|
return self.getTreeElement(htmltree, self.expr_smallcover)
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
|
def getExtrafanart(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_extrafanart)
|
return self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
|
|
||||||
def getTrailer(self, htmltree):
|
def getTrailer(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_trailer)
|
return self.getTreeElement(htmltree, self.expr_trailer)
|
||||||
|
|
||||||
def getActorPhoto(self, htmltree):
|
def getActorPhoto(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_actorphoto)
|
return self.getTreeAll(htmltree, self.expr_actorphoto)
|
||||||
|
|
||||||
def getUncensored(self, htmlree):
|
def getUncensored(self, htmlree):
|
||||||
if self.expr_uncensored:
|
if self.expr_uncensored:
|
||||||
u = self.getAll(htmlree, self.expr_uncensored)
|
u = self.getTreeAll(htmlree, self.expr_uncensored)
|
||||||
return bool(u)
|
return bool(u)
|
||||||
else:
|
else:
|
||||||
return self.uncensored
|
return self.uncensored
|
||||||
|
|
||||||
def getUserRating(self, htmltree):
|
def getUserRating(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_userrating)
|
return self.getTreeAll(htmltree, self.expr_userrating)
|
||||||
|
|
||||||
def getUserVotes(self, htmltree):
|
def getUserVotes(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_uservotes)
|
return self.getTreeAll(htmltree, self.expr_uservotes)
|
||||||
|
|
||||||
def getTreeIndex(self, tree: html.HtmlElement, expr, index=0):
|
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
|
||||||
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
|
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
|
||||||
"""
|
"""
|
||||||
return getTreeElement(tree, expr, index)
|
return getTreeElement(tree, expr, index)
|
||||||
|
|
||||||
def getAll(self, tree: html.HtmlElement, expr):
|
def getTreeAll(self, tree: html.HtmlElement, expr):
|
||||||
""" 根据表达式从`xmltree`中获取全部匹配值
|
""" 根据表达式从`xmltree`中获取全部匹配值
|
||||||
"""
|
"""
|
||||||
return getTreeAll(tree, expr)
|
return getTreeAll(tree, expr)
|
||||||
|
|||||||
@@ -31,10 +31,10 @@ class Tmdb(Parser):
|
|||||||
return movieUrl
|
return movieUrl
|
||||||
|
|
||||||
def getTitle(self, htmltree):
|
def getTitle(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_title).get('content')
|
return self.getTreeElement(htmltree, self.expr_title).get('content')
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
def getCover(self, htmltree):
|
||||||
return "https://www.themoviedb.org" + self.getTreeIndex(htmltree, self.expr_cover).get('content')
|
return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content')
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
return self.getTreeIndex(htmltree, self.expr_outline).get('content')
|
return self.getTreeElement(htmltree, self.expr_outline).get('content')
|
||||||
|
|||||||
@@ -28,17 +28,17 @@ class Xcity(Parser):
|
|||||||
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
|
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
|
||||||
|
|
||||||
def getRuntime(self, htmltree):
|
def getRuntime(self, htmltree):
|
||||||
return self.getAll(htmltree, self.expr_runtime)[1].strip()
|
return self.getTreeAll(htmltree, self.expr_runtime)[1].strip()
|
||||||
|
|
||||||
def getRelease(self, htmltree):
|
def getRelease(self, htmltree):
|
||||||
try:
|
try:
|
||||||
result = self.getTreeIndex(htmltree, self.expr_release, 1)
|
result = self.getTreeElement(htmltree, self.expr_release, 1)
|
||||||
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
|
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getTags(self, htmltree):
|
def getTags(self, htmltree):
|
||||||
result = self.getAll(htmltree, self.expr_tags)
|
result = self.getTreeAll(htmltree, self.expr_tags)
|
||||||
total = []
|
total = []
|
||||||
for i in result:
|
for i in result:
|
||||||
total.append(i.replace("\n","").replace("\t",""))
|
total.append(i.replace("\n","").replace("\t",""))
|
||||||
|
|||||||
Reference in New Issue
Block a user