def image_ext(url):
    """Return the image file extension to use for *url*.

    The extension is taken from the last dot-suffix of *url* (via
    ``os.path.splitext``) and compared case-insensitively against a small
    whitelist of known image extensions.

    :param url: URL or path of the image.
    :return: the lower-cased extension including the leading dot
        (e.g. ``".png"``) when it is whitelisted, otherwise ``".jpg"``.
        ``".jpg"`` is also returned when *url* is not something
        ``os.path.splitext`` can handle (e.g. ``None``).
    """
    # NOTE: the previous whitelist contained the typo '.jpge', which made
    # genuine '.jpeg' URLs fall through to the '.jpg' default.
    known_exts = {'.jpg', '.jpeg', '.bmp', '.png', '.gif'}
    try:
        ext = os.path.splitext(url)[-1].lower()
    except (TypeError, AttributeError):
        # Non-path input (None, numbers, ...): keep the best-effort default.
        return ".jpg"
    # bytes input yields a bytes suffix, which never matches the str set and
    # therefore also falls back to the default, as before.
    return ext if ext in known_exts else ".jpg"
# -*- coding: utf-8 -*-

import re
from lxml import etree
from urllib.parse import urljoin

from .parser import Parser


class Javmenu(Parser):
    """Scraper for javmenu.com detail pages.

    Field extraction is driven by the ``expr_*`` XPath expressions below;
    only the number and title need custom post-processing (see ``getNum``
    and ``getTitle``). Everything else is handled by the ``Parser`` base
    class, which is defined outside this module.
    """
    source = 'javmenu'

    # Title and cover come from the Open Graph meta tags in <head>.
    expr_title = '/html/head/meta[@property="og:title"]/@content'
    expr_cover = '/html/head/meta[@property="og:image"]/@content'

    # The number is rendered in two pieces (a link plus a following span);
    # both labels (traditional / simplified Chinese) are accepted.
    expr_number = '//span[contains(text(),"番號") or contains(text(),"番号")]/../a/text()'
    expr_number2 = '//span[contains(text(),"番號") or contains(text(),"番号")]/../span[2]/text()'
    # The traditional label used to be matched as "時長;" (with a stray
    # semicolon); "時長" matches a superset of those labels.
    expr_runtime = '//span[contains(text(),"時長") or contains(text(),"时长")]/../span[2]/text()'
    expr_release = '//span[contains(text(),"日期")]/../span[2]/text()'
    expr_studio = '//span[contains(text(),"製作")]/../span[2]/a/text()'

    expr_actor = '//a[contains(@class,"actress")]/text()'
    expr_tags = '//a[contains(@class,"genre")]/text()'

    def extraInit(self):
        """Set the source-specific defaults for this scraper."""
        # NOTE(review): the meaning of imagecut mode 4 is defined by the
        # consumer of this attribute, not visible here -- confirm.
        self.imagecut = 4
        self.uncensored = True

    def search(self, number):
        """Fetch and parse the detail page for *number*.

        :param number: the number to look up; stored on ``self.number``.
        :return: ``404`` when ``getHtml`` reports 404, otherwise whatever
            ``self.dictformat`` returns for the parsed page.
        """
        self.number = number
        if self.specifiedUrl:
            self.detailurl = self.specifiedUrl
        else:
            self.detailurl = 'https://javmenu.com/zh/' + self.number + '/'
        self.htmlcode = self.getHtml(self.detailurl)
        if self.htmlcode == 404:
            return 404
        htmltree = etree.HTML(self.htmlcode)
        result = self.dictformat(htmltree)
        return result

    def getNum(self, htmltree):
        """Return the number shown on the page, after verifying it.

        :param htmltree: parsed page as produced by ``etree.HTML``.
        :return: the number as displayed on the page; ``self.number`` is
            updated to this value.
        :raises Exception: when the page's number does not match the
            requested ``self.number`` (compared case-insensitively).
        """
        # The number is split across two elements; concatenate them to get
        # the complete number.
        # NOTE(review): assumes getTreeElement returns str for both parts.
        part1 = self.getTreeElement(htmltree, self.expr_number)
        part2 = self.getTreeElement(htmltree, self.expr_number2)
        dp_number = part1 + part2
        # NOTE: verify the match, then update self.number.
        if dp_number.upper() != self.number.upper():
            raise Exception(f'[!] {self.number}: find [{dp_number}] in javmenu, not match')
        self.number = dp_number
        return self.number

    def getTitle(self, htmltree):
        """Return the page title with the number and site suffix removed.

        :param htmltree: parsed page as produced by ``etree.HTML``.
        :return: the cleaned title string.
        """
        browser_title = super().getTitle(htmltree)
        # Remove the number: drop everything up to and including the digit
        # group that ends it. The second digit group is preferred (for
        # 'FC2-PPV-123456' that is '123456', not the '2' of 'FC2'); numbers
        # with a single digit group used to raise IndexError here.
        digit_groups = re.findall(r"\d+", self.number)
        if len(digit_groups) > 1:
            marker = digit_groups[1]
        elif digit_groups:
            marker = digit_groups[0]
        else:
            marker = None
        title = browser_title.split(marker, 1)[-1] if marker else browser_title
        # Strip the site-name suffix (traditional and simplified variants).
        title = title.replace(' | JAV目錄大全 | 每日更新', "")
        title = title.replace(' | JAV目录大全 | 每日更新', "").strip()
        return title.replace(self.number, '').strip()