From b7ecb66210421727d046f8e766341f9f35619115 Mon Sep 17 00:00:00 2001 From: Mathhew Date: Thu, 26 May 2022 14:03:58 +0800 Subject: [PATCH] add scrapinglib --- scrapinglib/__init__.py | 3 + scrapinglib/airav.py | 133 +++++++++++++++++ scrapinglib/api.py | 173 ++++++++++++++++++++++ scrapinglib/avsox.py | 78 ++++++++++ scrapinglib/carib.py | 101 +++++++++++++ scrapinglib/dlsite.py | 96 ++++++++++++ scrapinglib/fanza.py | 152 +++++++++++++++++++ scrapinglib/fc2.py | 73 ++++++++++ scrapinglib/gcolle.py | 75 ++++++++++ scrapinglib/getchu.py | 8 + scrapinglib/httprequest.py | 251 ++++++++++++++++++++++++++++++++ scrapinglib/jav321.py | 85 +++++++++++ scrapinglib/javbus.py | 145 ++++++++++++++++++ scrapinglib/javdb.py | 229 +++++++++++++++++++++++++++++ scrapinglib/madou.py | 65 +++++++++ scrapinglib/mgstage.py | 83 +++++++++++ scrapinglib/mv91.py | 93 ++++++++++++ scrapinglib/parser.py | 259 +++++++++++++++++++++++++++++++++ scrapinglib/storyline.py | 291 +++++++++++++++++++++++++++++++++++++ scrapinglib/xcity.py | 122 ++++++++++++++++ 20 files changed, 2515 insertions(+) create mode 100644 scrapinglib/__init__.py create mode 100644 scrapinglib/airav.py create mode 100644 scrapinglib/api.py create mode 100644 scrapinglib/avsox.py create mode 100644 scrapinglib/carib.py create mode 100644 scrapinglib/dlsite.py create mode 100644 scrapinglib/fanza.py create mode 100644 scrapinglib/fc2.py create mode 100644 scrapinglib/gcolle.py create mode 100644 scrapinglib/getchu.py create mode 100644 scrapinglib/httprequest.py create mode 100644 scrapinglib/jav321.py create mode 100644 scrapinglib/javbus.py create mode 100644 scrapinglib/javdb.py create mode 100644 scrapinglib/madou.py create mode 100644 scrapinglib/mgstage.py create mode 100644 scrapinglib/mv91.py create mode 100644 scrapinglib/parser.py create mode 100644 scrapinglib/storyline.py create mode 100644 scrapinglib/xcity.py diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py new file mode 100644 index 0000000..9f2e761 --- /dev/null +++ b/scrapinglib/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +from .api import search diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py new file mode 100644 index 0000000..eff264d --- /dev/null +++ b/scrapinglib/airav.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- + +import json +import re +from lxml import etree +from bs4 import BeautifulSoup +from .parser import Parser +from .javbus import Javbus + +class Airav(Parser): + source = 'airav' + + expr_title = '/html/head/title/text()' + expr_number = '/html/head/title/text()' + expr_studio = '//a[contains(@href,"?video_factory=")]/text()' + expr_release = '//li[contains(text(),"發片日期")]/text()' + expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)" + expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()' + expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src' + + def search(self, number, core: None): + self.number = number + self.updateCore(core) + + self.detailurl = 'https://cn.airav.wiki/video/' + number + engine = Javbus() + javbusinfo = engine.search(number, core) + if javbusinfo == 404: + self.javbus = {"title": ""} + else: + self.javbus = json.loads(javbusinfo) + self.htmlcode = self.getHtml(self.detailurl) + htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + # return super().getNum(htmltree) + result = self.javbus.get('number') + if isinstance(result, str) and len(result): + return result + number = super().getNum(htmltree) + result = str(re.findall('^\[(.*?)]', number)[0]) + return result + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip() + return result + + def getStudio(self, htmltree): + result = self.javbus.get('studio') + if isinstance(result, str) and len(result): + return result + return super().getStudio(htmltree) + + def getRelease(self, htmltree): + result = self.javbus.get('release') + if isinstance(result, str) and len(result): + return result + try: + return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group() + except: + return '' + + def getYear(self, htmltree): + result = self.javbus.get('year') + if isinstance(result, str) and len(result): + return result + release = self.getRelease(htmltree) + return str(re.findall('\d{4}', release)).strip(" ['']") + + def getOutline(self, htmltree): + return self.getAll(htmltree, self.expr_outline).replace('\n','').strip() + + def getRuntime(self, htmltree): + result = self.javbus.get('runtime') + if isinstance(result, str) and len(result): + return result + return '' + + def getDirector(self, htmltree): + result = self.javbus.get('director') + if isinstance(result, str) and len(result): + return result + return '' + + def getActors(self, htmltree): + b=[] + a = super().getActors(htmltree) + for v in a: + v = v.strip() + if len(v): + b.append(v) + if len(b): + return b + result = self.javbus.get('actor') + if isinstance(result, list) and len(result): + return result + return [] + + def getCover(self, htmltree): + result = self.javbus.get('cover') + if isinstance(result, str) and len(result): + return result + return super().getCover(htmltree) + + def getExtrafanart(self, htmltree): + html_pather = re.compile(r'
[\s\S]*?
') + html = html_pather.search(self.htmlcode) + if html: + html = html.group() + extrafanart_pather = re.compile(r' bool: # 元数据获取失败检测 + if "title" not in data or "number" not in data: + return False + if data["title"] is None or data["title"] == "" or data["title"] == "null": + return False + if data["number"] is None or data["number"] == "" or data["number"] == "null": + return False + return True diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py new file mode 100644 index 0000000..79a0404 --- /dev/null +++ b/scrapinglib/avsox.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +import re +from .parser import Parser + + +class Avsox(Parser): + + source = 'avsox' + imagecut = 3 + + expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()' + expr_actor = '//a[@class="avatar-box"]' + expr_actorphoto = '//a[@class="avatar-box"]' + expr_title = '/html/body/div[2]/h3/text()' + expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()' + expr_release = '//span[contains(text(),"发行时间:")]/../text()' + expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src' + expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src' + expr_tags = '/html/head/meta[@name="keywords"]/@content' + expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()' + expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()' + + def queryNumberUrl(self, number): + qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox') + site = self.getTreeIndex(qurySiteTree, '//div[@class="container"]/div/a/@href') + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) + result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + if result1 == '' or result1 == 'null' or result1 == 'None': + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_')) + result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + if result1 == '' or result1 == 'null' or result1 == 'None': + self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', '')) + result1 = self.getTreeIndex(self.searchtree, '//*[@id="waterfall"]/div/a/@href') + return "https:" + result1 + + def getNum(self, htmltree): + new_number = self.getTreeIndex(htmltree, self.expr_number) + if new_number.upper() != self.number.upper(): + raise ValueError('number not found in ' + self.source) + self.number = new_number + return new_number + + def getTitle(self, htmltree): + return super().getTitle(htmltree).replace('/', '').strip(self.number) + + def getStudio(self, htmltree): + return super().getStudio(htmltree).replace("', '", ' ') + + def getSmallCover(self, htmltree): + """ 使用搜索页面的预览小图 + """ + return self.getTreeIndex(self.searchtree, self.expr_smallcover) + + def getTags(self, htmltree): + tags = super().getTags(htmltree).split(',') + return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] + + def getOutline(self, htmltree): + from .storyline import getStoryline + return getStoryline(self.number) + + def getActors(self, htmltree): + a = super().getActors(htmltree) + d = [] + for i in a: + d.append(i.find('span').text) + return d + + def getActorPhoto(self, htmltree): + a = super().getActorPhoto(htmltree) + d = {} + for i in a: + l = i.find('.//img').attrib['src'] + t = i.find('span').text + p2 = {t: l} + d.update(p2) + return d diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py new file mode 100644 index 0000000..38fad8a --- /dev/null +++ b/scrapinglib/carib.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +import re +from urllib.parse import urljoin +from lxml import html +from .parser import Parser + + +class Carib(Parser): + source = 'carib' + uncensored = True + + expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()" + expr_release = "//li[2]/span[@class='spec-content']/text()" + expr_runtime = "//span[@class='spec-content']/span[@itemprop='duration']/text()" + expr_actor = "//span[@class='spec-content']/a[@itemprop='actor']/span/text()" + expr_tags = "//span[@class='spec-content']/a[@itemprop='genre']/text()" + expr_extrafanart = "//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href" + expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" + expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" + expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()" + + def search(self, number, core: None): + self.number = number + self.updateCore(core) + + self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' + htmlcode = self.getHtml(self.detailurl) + if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: + return 404 + htmltree = html.fromstring(htmlcode) + result = self.dictformat(htmltree) + return result + + def getStudio(self, htmltree): + return '加勒比' + + def getActors(self, htmltree): + r = [] + actors = super().getActors(htmltree) + for act in actors: + if str(act) != '他': + r.append(act) + return r + + def getNum(self, htmltree): + return self.number + + def getCover(self, htmltree): + return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg' + + def getTags(self, htmltree): + return self.getAll(htmltree, self.expr_tags) + + def getExtrafanart(self, htmltree): + r = [] + genres = self.getAll(htmltree, self.expr_extrafanart) + for g in genres: + jpg = str(g) + if '/member/' in jpg: + break + else: + r.append('https://www.caribbeancom.com' + jpg) + return r + + def getActorPhoto(self, htmltree): + # return super().getActorPhoto(htmltree) + htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']") + names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()") + t = {} + for name, a in zip(names, htmla): + if name.strip() == '他': + continue + p = {name.strip(): a.attrib['href']} + t.update(p) + o = {} + for k, v in t.items(): + if '/search_act/' not in v: + continue + r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object') + if not r.ok: + continue + html = r.text + pos = html.find('.full-bg') + if pos<0: + continue + css = html[pos:pos+100] + cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I) + if not cssBGjpgs or not len(cssBGjpgs[0]): + continue + p = {k: urljoin(r.url, cssBGjpgs[0])} + o.update(p) + return o + + def getOutline(self, htmltree): + from .storyline import getStoryline + result = getStoryline(self.number, uncensored=self.uncensored) + if len(result): + return result + return super().getOutline(htmltree) + diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py new file mode 100644 index 0000000..40f8caf --- /dev/null +++ b/scrapinglib/dlsite.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from .parser import Parser + + +class Dlsite(Parser): + source = 'dlsite' + imagecut = 4 + + expr_title = '/html/head/title/text()' + expr_actor = '//th[contains(text(),"声优")]/../td/a/text()' + expr_studio = '//th[contains(text(),"商标名")]/../td/span[1]/a/text()' + expr_studio2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_runtime = '//strong[contains(text(),"時長")]/../span/text()' + expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()' + expr_outline = '//*[@class="work_parts_area"]/p/text()' + expr_series = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()' + expr_series2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_director = '//th[contains(text(),"剧情")]/../td/a/text()' + expr_release = '//th[contains(text(),"贩卖日")]/../td/a/text()' + expr_cover = '//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset' + expr_tags = '//th[contains(text(),"分类")]/../td/div/a/text()' + expr_label = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()' + expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' + expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src' + + def search(self, number, core: None): + self.updateCore(core) + self.cookies = {'locale': 'zh-cn'} + + if "RJ" in number or "VJ" in number: + self.number = number.upper() + self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' + htmltree = self.getHtmlTree(self.detailurl) + else: + self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie' + htmltree = self.getHtmlTree(self.detailurl) + search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","") + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + if "~" in number: + number = number.replace("~","〜") + elif "〜" in number: + number = number.replace("〜","~") + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + if len(search_result) == 0: + number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '') + htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie') + search_result = self.getAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href') + self.detailurl = search_result[0] + htmltree = self.getHtmlTree(self.detailurl) + self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") + + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + return self.number + + def getTitle(self, htmltree): + result = super().getTitle(htmltree) + result = result[:result.rfind(' | DLsite')] + result = result[:result.rfind(' [')] + result = result.replace('【HD版】', '') + return result + + def getOutline(self, htmltree): + total = [] + result = self.getAll(htmltree, self.expr_outline) + for i in result: + total.append(i.strip('\r\n')) + return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '") + + def getRelease(self, htmltree): + return super().getRelease(htmltree).replace('年','-').replace('月','-').replace('日','') + + def getCover(self, htmltree): + return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg') + + def getTags(self, htmltree): + return self.getAll(htmltree, self.expr_tags) + + def getExtrafanart(self, htmltree): + try: + result = [] + for i in self.getAll(self.expr_extrafanart): + result.append("https:" + i) + except: + result = '' + return result diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py new file mode 100644 index 0000000..b8d4569 --- /dev/null +++ b/scrapinglib/fanza.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urlencode +from .parser import Parser + + +class Fanza(Parser): + source = 'fanza' + + expr_title = '//*[starts-with(@id, "title")]/text()' + expr_outline = "//div[@class='mg-b20 lh4']/text()" + expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" + expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" + + def search(self, number, core: None): + self.number = number + self.updateCore(core) + + # fanza allow letter + number + underscore, normalize the input here + # @note: I only find the usage of underscore as h_test123456789 + fanza_search_number = number + # AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix + if fanza_search_number.startswith("h-"): + fanza_search_number = fanza_search_number.replace("h-", "h_") + + fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower() + + fanza_urls = [ + "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=", + "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=", + "https://www.dmm.co.jp/rental/-/detail/=/cid=", + ] + + for url in fanza_urls: + self.detailurl = url + fanza_search_number + url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl}) + self.htmlcode = self.getHtml(url) + if self.htmlcode != 404: + self.htmltree = etree.HTML(self.htmlcode) + break + if self.htmlcode == 404: + return 404 + result = self.dictformat(self.htmltree) + return result + + def getNum(self, htmltree): + # for some old page, the input number does not match the page + # for example, the url will be cid=test012 + # but the hinban on the page is test00012 + # so get the hinban first, and then pass it to following functions + self.fanza_hinban = self.getFanzaString('品番:') + self.number = self.fanza_hinban + number_lo = self.number.lower() + if (re.sub('-|_', '', number_lo) == self.fanza_hinban or + number_lo.replace('-', '00') == self.fanza_hinban or + number_lo.replace('-', '') + 'so' == self.fanza_hinban + ): + self.number = self.number + return self.number + + def getStudio(self, htmltree): + return self.getFanzaString('メーカー') + + def getOutline(self, htmltree): + try: + result = self.getTreeIndex(htmltree, self.expr_outline).replace("\n", "") + if result == '': + result = self.getTreeIndex(htmltree, self.expr_outline2).replace("\n", "") + return result + except: + return '' + + def getRuntime(self, htmltree): + return str(re.search(r'\d+', super().getRuntime(htmltree)).group()).strip(" ['']") + + def getDirector(self, htmltree): + if "anime" not in self.detailurl: + return self.getFanzaString('監督:') + return '' + + def getActors(self, htmltree): + if "anime" not in self.detailurl: + return super().getActors(htmltree).replace("', '", ",") + return '' + + def getRelease(self, htmltree): + result = self.getFanzaString('発売日:') + if result == '' or result == '----': + result = self.getFanzaString('配信開始日:') + return result.replace("/", "-").strip('\\n') + + def getCover(self, htmltree): + # return super().getCover(htmltree) + cover_number = self.fanza_hinban + try: + result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') + except: + # sometimes fanza modify _ to \u0005f for image id + if "_" in cover_number: + cover_number = cover_number.replace("_", r"\u005f") + try: + result = self.getTreeIndex(htmltree, '//*[@id="' + cover_number + '"]/@href') + except: + # (TODO) handle more edge case + # print(html) + # raise exception here, same behavior as before + # people's major requirement is fetching the picture + raise ValueError("can not find image") + return result + + def getTags(self, htmltree): + return self.getFanzaStrings('ジャンル:') + + def getExtrafanart(self, htmltree): + html_pather = re.compile(r'
\n') + html = html_pather.search(self.htmlcode) + if html: + html = html.group() + extrafanart_pather = re.compile(r' 0: + return result1 + result2 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()") + return result2 diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py new file mode 100644 index 0000000..24e1c9b --- /dev/null +++ b/scrapinglib/fc2.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- + +import re +from lxml import etree +from urllib.parse import urljoin + +from .parser import Parser + + +class Fc2(Parser): + source = 'fc2' + imagecut = 0 + + expr_title = '/html/head/title/text()' + expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_release = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()' + expr_runtime = "//p[@class='items_article_info']/text()" + expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()' + expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src" + expr_tags = "//a[@class='tag tagTag']/text()" + + def search(self, number, core: None): + self.number = number.replace('FC2-', '').replace('fc2-', '') + self.updateCore(core) + + self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' + self.htmlcode = self.getHtml(self.detailurl) + if self.htmlcode == 404: + return 404 + htmltree = etree.HTML(self.htmlcode) + result = self.dictformat(htmltree) + return result + + def getNum(self, htmltree): + return 'FC2-' + self.number + + def getRelease(self, htmltree): + return super().getRelease(htmltree).strip(" ['販売日 : ']").replace('/','-') + + def getActors(self, htmltree): + actors = super().getActors(htmltree) + if not actors: + actors = '素人' + return actors + + def getCover(self, htmltree): + return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree)) + + def getExtrafanart(self, htmltree): + html_pather = re.compile(r'