From 793ef89f2283c86d49db9ca4cd13e8d5c2f2cd28 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Mon, 22 Aug 2022 10:39:41 +0800 Subject: [PATCH] update init --- scrapinglib/airav.py | 8 +++++--- scrapinglib/avsox.py | 7 ++++--- scrapinglib/carib.py | 4 +++- scrapinglib/dlsite.py | 6 ++++-- scrapinglib/fc2.py | 4 +++- scrapinglib/gcolle.py | 10 +++++----- scrapinglib/getchu.py | 33 ++++++++++++++++---------------- scrapinglib/javdb.py | 7 ++++--- scrapinglib/javlibrary.py | 5 +++-- scrapinglib/madou.py | 6 ++++-- scrapinglib/mv91.py | 6 ++++-- scrapinglib/parser.py | 40 +++++++++++++++++++++------------------ 12 files changed, 78 insertions(+), 58 deletions(-) diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index f0e2e39..e157014 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -8,9 +8,6 @@ from .javbus import Javbus class Airav(Parser): source = 'airav' - # for javbus - specifiedSource = None - addtion_Javbus = True expr_title = '/html/head/title/text()' expr_number = '/html/head/title/text()' @@ -22,6 +19,11 @@ class Airav(Parser): expr_tags = '//div[@class="tagBtnMargin"]/a/text()' expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href' + def extraInit(self): + # for javbus + self.specifiedSource = None + self.addtion_Javbus = True + def search(self, number): self.number = number if self.specifiedUrl: diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index 7788e13..f3b4a2e 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -4,10 +4,7 @@ from .parser import Parser class Avsox(Parser): - source = 'avsox' - imagecut = 3 - originalnum = '' expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()' expr_actor = '//a[@class="avatar-box"]' @@ -21,6 +18,10 @@ class Avsox(Parser): expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()' expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()' + def extraInit(self): + self.imagecut = 3 + self.originalnum = '' + def queryNumberUrl(self, number: str): upnum = number.upper() if 'FC2' in upnum and 'FC2-PPV' not in upnum: diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index cc04ae7..0a561de 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -8,7 +8,6 @@ from .parser import Parser class Carib(Parser): source = 'carib' - uncensored = True expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()" expr_release = "//li[2]/span[@class='spec-content']/text()" @@ -20,6 +19,9 @@ class Carib(Parser): expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()" + def extraInit(self): + self.uncensored = True + def search(self, number): self.number = number if self.specifiedUrl: diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index c8415d4..d307f02 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -6,8 +6,6 @@ from .parser import Parser class Dlsite(Parser): source = 'dlsite' - imagecut = 4 - allow_number_change = True expr_title = '/html/head/title/text()' expr_actor = '//th[contains(text(),"声优")]/../td/a/text()' @@ -26,6 +24,10 @@ class Dlsite(Parser): expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src' + def extraInit(self): + self.imagecut = 4 + self.allow_number_change = True + def search(self, number): self.cookies = {'locale': 'zh-cn'} if self.specifiedUrl: diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 6707682..21629ea 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -9,7 +9,6 @@ from .parser import Parser class Fc2(Parser): source = 'fc2' - imagecut = 0 expr_title = '/html/head/title/text()' expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' @@ -21,6 +20,9 @@ class Fc2(Parser): expr_extrafanart = '//ul[@class="items_article_SampleImagesArea"]/li/a/@href' expr_tags = "//a[@class='tag tagTag']/text()" + def extraInit(self): + self.imagecut = 0 + def search(self, number): self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '') if self.specifiedUrl: diff --git a/scrapinglib/gcolle.py b/scrapinglib/gcolle.py index c6d7027..a4f4f35 100644 --- a/scrapinglib/gcolle.py +++ b/scrapinglib/gcolle.py @@ -8,7 +8,6 @@ from .parser import Parser class Gcolle(Parser): source = 'gcolle' - imagecut = 4 expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href' expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()' @@ -25,8 +24,11 @@ class Gcolle(Parser): expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src' expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src' - def search(self, number): - self.number = number.upper().replace('GCOLLE-','') + def extraInit(self): + self.imagecut = 4 + + def search(self, number: str): + self.number = number.upper().replace('GCOLLE-', '') if self.specifiedUrl: self.detailurl = self.specifiedUrl else: @@ -69,5 +71,3 @@ class Gcolle(Parser): for i in range(len(extrafanart)): extrafanart[i] = 'https:' + extrafanart[i] return extrafanart - - diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py index 1372ba8..cd77d77 100644 --- a/scrapinglib/getchu.py +++ b/scrapinglib/getchu.py @@ -28,12 +28,6 @@ class Getchu(): return dic class wwwGetchu(Parser): - imagecut = 0 - allow_number_change = True - - cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"} - GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' - expr_title = '//*[@id="soft-title"]/text()' expr_cover = '//head/meta[@property="og:image"]/@content' expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" @@ -46,6 +40,13 @@ class wwwGetchu(Parser): expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href" expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" + def extraInit(self): + self.imagecut = 0 + self.allow_number_change = True + + self.cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"} + self.GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' + def queryNumberUrl(self, number): if 'GETCHU' in number.upper(): idn = re.findall('\d+',number)[0] @@ -96,16 +97,6 @@ class dlGetchu(wwwGetchu): """ 二者基本一致 headers extrafanart 略有区别 """ - - imagecut = 4 - allow_number_change = True - - cookies = {"adult_check_flag": "1"} - extraheader = {"Referer": "https://dl.getchu.com/"} - - GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1' - GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_' - expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()" expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()" expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()" @@ -117,6 +108,16 @@ class dlGetchu(wwwGetchu): expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href" expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()" + def extraInit(self): + self.imagecut = 4 + self.allow_number_change = True + + self.cookies = {"adult_check_flag": "1"} + self.extraheader = {"Referer": "https://dl.getchu.com/"} + + self.GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1' + self.GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_' + def queryNumberUrl(self, number): if "item" in number or 'GETCHU' in number.upper(): self.number = re.findall('\d+',number)[0] diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index 0285de8..5a60997 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -11,9 +11,6 @@ from .parser import Parser class Javdb(Parser): source = 'javdb' - fixstudio = False - noauth = False - expr_number = '//strong[contains(text(),"番號")]/../span/text()' expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()' expr_title = "/html/head/title/text()" @@ -44,6 +41,10 @@ class Javdb(Parser): expr_uservotes = '//span[@class="score-stars"]/../text()' expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]' + def extraInit(self): + self.fixstudio = False + self.noauth = False + def updateCore(self, core): if core.proxies: self.proxies = core.proxies diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py index 61bc127..4f3862a 100644 --- a/scrapinglib/javlibrary.py +++ b/scrapinglib/javlibrary.py @@ -8,8 +8,6 @@ from .parser import Parser class Javlibrary(Parser): source = 'javlibrary' - htmltree = None - expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()' expr_title = '//div[@id="video_title"]/h3/a/text()' expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()' @@ -22,6 +20,9 @@ class Javlibrary(Parser): expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()' expr_extrafanart = '//div[@class="previewthumbs"]/img/@src' + def extraInit(self): + self.htmltree = None + def updateCore(self, core): if core.proxies: self.proxies = core.proxies diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index f3ffd20..6e288b6 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -8,14 +8,16 @@ from .parser import Parser class Madou(Parser): source = 'madou' - imagecut = 0 - uncensored = True expr_url = '//a[@class="share-weixin"]/@data-url' expr_title = "/html/head/title/text()" expr_studio = '//a[@rel="category tag"]/text()' expr_tags = '/html/head/meta[@name="keywords"]/@content' + def extraInit(self): + self.imagecut = 0 + self.uncensored = True + def search(self, number): self.number = number.lower().strip() if self.specifiedUrl: diff --git a/scrapinglib/mv91.py b/scrapinglib/mv91.py index 7d589b1..65a7f7e 100644 --- a/scrapinglib/mv91.py +++ b/scrapinglib/mv91.py @@ -8,8 +8,6 @@ from .parser import Parser class Mv91(Parser): source = 'mv91' - imagecut = 0 - uncensored = True expr_number = '//div[@class="player-title"]/text()' expr_title = '//div[@class="player-title"]/text()' @@ -18,6 +16,10 @@ class Mv91(Parser): expr_tags = '//div[@class="player-tag"]/text()' expr_actor = '//p[@class="player-name"]/text()' + def extraInit(self): + self.imagecut = 0 + self.uncensored = True + def getHtmlTree(self, url, type=None): self.htmlcode = self.getHtml(url, type) if self.htmlcode == 404: diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index 0c3f3c6..f77595b 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -11,23 +11,6 @@ class Parser: """ 基础刮削类 """ source = 'base' - # 推荐剪切poster封面: - # `0` 复制cover - # `1` 裁剪cover - # `3` 下载小封面 - imagecut = 1 - uncensored = False - allow_number_change = False - # update - proxies = None - verify = None - extraheader = None - cookies = None - morestoryline = False - specifiedUrl = None - - number = '' - detailurl = '' # xpath expr expr_number = '' expr_title = '' @@ -54,12 +37,33 @@ class Parser: expr_userrating = '' expr_uservotes = '' - def __init__(self) -> None: + def __init__(self): + # 推荐剪切poster封面: + # `0` 复制cover + # `1` 裁剪cover + # `3` 下载小封面 + self.imagecut = 1 + self.uncensored = False + self.allow_number_change = False + # update + self.proxies = None + self.verify = None + self.extraheader = None + self.cookies = None + self.morestoryline = False + self.specifiedUrl = None + self.extraInit() + + def extraInit(self): + """ 自定义初始化内容 + """ pass def scrape(self, number, core: None): """ 刮削番号 """ + # 每次调用,初始化参数 + self.__init__() self.updateCore(core) result = self.search(number) return result