update scrapinglib

- 优化提取extrafanart,trailer等,直接使用xpath expr,不需要正则匹配
- 优化 getchu 获取cover方法,直接使用og标签信息
- 优化 www.getchu 识别 getchu-id 的资源
- 统一获取 tag 方法,返回值 list
This commit is contained in:
Mathhew
2022-06-15 14:23:49 +08:00
parent eed33408a8
commit 0dda035057
16 changed files with 107 additions and 218 deletions

View File

@@ -35,7 +35,7 @@ class wwwGetchu(Parser):
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
expr_title = '//*[@id="soft-title"]/text()'
expr_cover = "/html/body/div[1]/table[2]/tr[1]/td/a/@href"
expr_cover = '//head/meta[@property="og:image"]'
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
@@ -47,8 +47,12 @@ class wwwGetchu(Parser):
expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
def queryNumberUrl(self, number):
self.number = quote(number, encoding="euc_jp")
queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number)
if 'GETCHU' in number.upper():
idn = re.findall('\d+',number)[0]
return "http://www.getchu.com/soft.phtml?id=" + idn
else:
self.number = quote(number, encoding="euc_jp")
queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number)
# NOTE dont know why will try 2 times
retry = 2
for i in range(retry):
@@ -64,14 +68,11 @@ class wwwGetchu(Parser):
return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
def getCover(self, htmltree):
return "http://www.getchu.com" + super().getCover(htmltree).replace("./", '/')
return self.getTreeElement(htmltree, self.expr_cover).get('content')
def getActors(self, htmltree):
return super().getDirector(htmltree)
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getOutline(self, htmltree):
outline = ''
_list = self.getTreeAll(htmltree, self.expr_outline)
@@ -109,7 +110,6 @@ class dlGetchu(wwwGetchu):
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
expr_cover = "//td[contains(@bgcolor,'#ffffff')]/img/@src"
expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
expr_label = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
@@ -135,9 +135,6 @@ class dlGetchu(wwwGetchu):
def getNum(self, htmltree):
return 'GETCHU-' + re.findall('\d+', self.number)[0]
def getCover(self, htmltree):
return "https://dl.getchu.com" + super().getCover(htmltree)
def extradict(self, dic: dict):
return dic