From 9898f2918feda70b7822009cc006585686596ade Mon Sep 17 00:00:00 2001 From: Mathhew Date: Fri, 27 May 2022 15:24:29 +0800 Subject: [PATCH] update scrapinglib --- scrapinglib/airav.py | 6 +- scrapinglib/api.py | 50 +++++++++------ scrapinglib/avsox.py | 6 +- scrapinglib/carib.py | 4 +- scrapinglib/dlsite.py | 5 +- scrapinglib/fanza.py | 4 +- scrapinglib/fc2.py | 4 +- scrapinglib/gcolle.py | 4 +- scrapinglib/getchu.py | 125 ++++++++++++++++++++++++++++++++++++- scrapinglib/httprequest.py | 5 +- scrapinglib/javbus.py | 14 ++--- scrapinglib/javdb.py | 21 +++++-- scrapinglib/madou.py | 4 +- scrapinglib/mgstage.py | 4 +- scrapinglib/parser.py | 20 ++++-- scrapinglib/xcity.py | 10 +-- 16 files changed, 213 insertions(+), 73 deletions(-) diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index eff264d..69f36b0 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -18,13 +18,11 @@ class Airav(Parser): expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()' expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src' - def search(self, number, core: None): + def search(self, number): self.number = number - self.updateCore(core) - self.detailurl = 'https://cn.airav.wiki/video/' + number engine = Javbus() - javbusinfo = engine.search(number, core) + javbusinfo = engine.search(number, self) if javbusinfo == 404: self.javbus = {"title": ""} else: diff --git a/scrapinglib/api.py b/scrapinglib/api.py index 5639fdd..d8d3951 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -8,6 +8,7 @@ from scrapinglib.carib import Carib from scrapinglib.dlsite import Dlsite from scrapinglib.fanza import Fanza from scrapinglib.gcolle import Gcolle +from scrapinglib.getchu import Getchu from scrapinglib.jav321 import Jav321 from scrapinglib.javdb import Javdb from scrapinglib.mv91 import Mv91 @@ -19,13 +20,15 @@ from .xcity import Xcity from .avsox import Avsox -def search(number, souces=None, proxies=None, dbcookies=None): - """ +def search(number, souces=None, proxies=None, verify=None, dbcookies=None, dbsite=None, morestoryline=True): + """ TODO 支持更多网站 douban, imdb,tmdb anidb等 type 区分 r18 与 normal """ sc = Scraping() - return sc.search(number, souces, proxies=proxies, dbcookies=dbcookies) + return sc.search(number, souces, proxies=proxies, verify=verify, + dbcookies=dbcookies, dbsite=dbsite, + morestoryline=morestoryline) class Scraping(): @@ -54,30 +57,39 @@ class Scraping(): full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', 'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91', - 'gcolle', 'javdb'] + 'gcolle', 'javdb', 'getchu'] func_mapping = { - 'avsox': Avsox().search, - 'javbus': Javbus().search, - 'xcity': Xcity().search, - 'mgstage': Mgstage().search, - 'madou': Madou().search, - 'fc2': Fc2().search, - 'dlsite': Dlsite().search, - 'jav321': Jav321().search, - 'fanza': Fanza().search, - 'airav': Airav().search, - 'carib': Carib().search, - 'mv91': Mv91().search, - 'gcolle': Gcolle().search, - 'javdb': Javdb().search, + 'avsox': Avsox().scrape, + 'javbus': Javbus().scrape, + 'xcity': Xcity().scrape, + 'mgstage': Mgstage().scrape, + 'madou': Madou().scrape, + 'fc2': Fc2().scrape, + 'dlsite': Dlsite().scrape, + 'jav321': Jav321().scrape, + 'fanza': Fanza().scrape, + 'airav': Airav().scrape, + 'carib': Carib().scrape, + 'mv91': Mv91().scrape, + 'gcolle': Gcolle().scrape, + 'javdb': Javdb().scrape, + 'getchu': Getchu().scrape, } proxies = None + verify = None dbcookies = None + dbsite = None + # 使用storyline方法进一步获取故事情节 + morestoryline = True 
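
For reference, a minimal usage sketch of the refactored entry point after this patch. The number, proxy address, and cookie values are placeholders, the comma-separated source list is an assumption based on how sources are passed around this project, and `souces` is the parameter's actual spelling in api.py:

    import scrapinglib

    json_data = scrapinglib.search(
        'ABC-123',                  # placeholder number
        souces='javbus,javdb',      # requested sources (spelling per the api.py signature)
        proxies={'http': 'http://127.0.0.1:1080',
                 'https': 'http://127.0.0.1:1080'},
        verify=None,                # or a CA bundle path to enable TLS verification
        dbcookies={'over18': '1'},  # handed to the javdb parser via updateCore()
        dbsite='javdb',             # host prefix used by Javdb.queryNumberUrl()
        morestoryline=True,         # opt in to the extra storyline lookup
    )
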
- def search(self, number, sources=None, proxies=None, dbcookies=None): + def search(self, number, sources=None, proxies=None, verify=None, + dbcookies=None, dbsite=None, morestoryline=True): self.proxies = proxies + self.verify = verify self.dbcookies = dbcookies + self.dbsite = dbsite + self.morestoryline = morestoryline sources = self.checkSources(sources, number) json_data = {} diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index 79a0404..ffb6723 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -57,8 +57,10 @@ class Avsox(Parser): return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] def getOutline(self, htmltree): - from .storyline import getStoryline - return getStoryline(self.number) + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number) + return '' def getActors(self, htmltree): a = super().getActors(htmltree) diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 38fad8a..415006d 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -20,10 +20,8 @@ class Carib(Parser): expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()" expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()" - def search(self, number, core: None): + def search(self, number): self.number = number - self.updateCore(core) - self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' htmlcode = self.getHtml(self.detailurl) if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index 40f8caf..125afe2 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -8,6 +8,7 @@ from .parser import Parser class Dlsite(Parser): source = 'dlsite' imagecut = 4 + allow_number_change = True expr_title = '/html/head/title/text()' expr_actor = '//th[contains(text(),"声优")]/../td/a/text()' @@ -26,10 +27,8 @@ class Dlsite(Parser): expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()' expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src' - def search(self, number, core: None): - self.updateCore(core) + def search(self, number): self.cookies = {'locale': 'zh-cn'} - if "RJ" in number or "VJ" in number: self.number = number.upper() self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index b8d4569..c06d26c 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -14,10 +14,8 @@ class Fanza(Parser): expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" - def search(self, number, core: None): + def search(self, number): self.number = number - self.updateCore(core) - # fanza allow letter + number + underscore, normalize the input here # @note: I only find the usage of underscore as h_test123456789 fanza_search_number = number diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 24e1c9b..c12a1ce 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -20,10 +20,8 @@ class Fc2(Parser): expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src" expr_tags = "//a[@class='tag tagTag']/text()" - def search(self, number, core: None): + def search(self, number): self.number = number.replace('FC2-', '').replace('fc2-', '') - self.updateCore(core) - self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' 
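
The hunks above and below all converge on the same shape: `scrape()` applies the shared core settings, `search()` only handles the number, and storyline lookups are gated behind `morestoryline`. Condensed into one hypothetical parser (the class name and URL are illustrative; the method names follow the patch):

    class ExampleParser(Parser):
        source = 'example'

        def search(self, number):
            # Parser.scrape() has already run updateCore(core), so search()
            # no longer takes a core argument or calls updateCore() itself.
            self.number = number
            self.detailurl = 'https://example.com/video/' + number
            htmltree = self.getHtmlTree(self.detailurl)
            return self.dictformat(htmltree)

        def getOutline(self, htmltree):
            # the storyline lookup is now opt-in via the morestoryline flag
            if self.morestoryline:
                from .storyline import getStoryline
                return getStoryline(self.number)
            return ''
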
self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: diff --git a/scrapinglib/gcolle.py b/scrapinglib/gcolle.py index c1d53fa..93fc58b 100644 --- a/scrapinglib/gcolle.py +++ b/scrapinglib/gcolle.py @@ -25,10 +25,8 @@ class Gcolle(Parser): expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src' expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src' - def search(self, number, core: None): + def search(self, number): self.number = number.upper().replace('GCOLLE-','') - self.updateCore(core) - self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py index 707b733..5261ee6 100644 --- a/scrapinglib/getchu.py +++ b/scrapinglib/getchu.py @@ -1,8 +1,129 @@ # -*- coding: utf-8 -*- - +import re +import json +from urllib.parse import quote from .parser import Parser -class Getchu(Parser): +class Getchu(): source = 'getchu' + + def scrape(self, number, core: None): + dl = dlGetchu() + www = wwwGetchu() + number = number.replace("-C", "") + dic = {} + if "item" in number: + sort = ["dl.scrape(number, core)", "www.scrape(number, core)"] + else: + sort = ["www.scrape(number, core)", "dl.scrape(number, core)"] + for i in sort: + try: + dic = eval(i) + if dic != None and json.loads(dic).get('title') != '': + break + except: + pass + return dic + +class wwwGetchu(Parser): + imagecut = 0 + allow_number_change = True + + cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"} + GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' + + expr_title = '//*[@id="soft-title"]/text()' + expr_cover = "/html/body/div[1]/table[2]/tr[1]/td/a/@href" + expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" + expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" + expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" + expr_label = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" + expr_release = "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()" + expr_tags = "//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()" + expr_outline = "//div[contains(text(),'商品紹介')]/following-sibling::div/text()" + expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href" + expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()" + + def queryNumberUrl(self, number): + self.number = quote(number, encoding="euc_jp") + queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number) + # NOTE dont know why will try 2 times + retry = 2 + for i in range(retry): + queryTree = self.getHtmlTree(queryUrl) + detailurl = self.getTreeIndex(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href') + if detailurl: + break + if detailurl == "": + return None + return detailurl.replace('../', 'http://www.getchu.com/') + + def getNum(self, htmltree): + return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0] + + def getCover(self, htmltree): + return "http://www.getchu.com" + super().getCover(htmltree).replace("./", '/') + + def getActors(self, htmltree): + return super().getDirector(htmltree) + + def getTags(self, htmltree): + return 
self.getAll(htmltree, self.expr_tags) + + def getOutline(self, htmltree): + outline = '' + _list = self.getAll(htmltree, self.expr_outline) + for i in _list: + outline = outline + i.strip() + return outline + + def getExtrafanart(self, htmltree): + arts = super().getExtrafanart(htmltree) + extrafanart = [] + for i in arts: + i = "http://www.getchu.com" + i.replace("./", '/') + if 'jpg' in i: + extrafanart.append(i) + return extrafanart + +class dlGetchu(wwwGetchu): + imagecut = 4 + allow_number_change = True + + cookies = {"adult_check_flag": "1"} + extraheader = {"Referer": "https://dl.getchu.com/"} + + GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1' + GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_' + + expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()" + expr_cover = "//td[contains(@bgcolor,'#ffffff')]/img/@src" + expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()" + expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()" + expr_label = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()" + expr_runtime = "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()" + expr_release = "//td[contains(text(),'配信開始日')]/following-sibling::td/text()" + expr_tags = "//td[contains(text(),'趣向')]/following-sibling::td/a/text()" + expr_outline = "//*[contains(text(),'作品内容')]/following-sibling::td/text()" + expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href" + expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()" + + def queryNumberUrl(self, number): + if "item" in number or 'GETCHU' in number.upper(): + self.number = re.findall('\d+',number)[0] + else: + queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number) + queryTree = self.getHtmlTree(queryUrl) + detailurl = self.getTreeIndex(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href') + if detailurl == "": + return None + self.number = re.findall('\d+', detailurl)[0] + return self.GETCHU_DL_URL.replace("_WORD_", self.number) + + def getNum(self, htmltree): + return 'GETCHU-' + re.findall('\d+', self.number)[0] + + def getCover(self, htmltree): + return "https://dl.getchu.com" + super().getCover(htmltree) diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py index adaf5f3..0c677c0 100644 --- a/scrapinglib/httprequest.py +++ b/scrapinglib/httprequest.py @@ -9,7 +9,7 @@ from cloudscraper import create_scraper G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' G_DEFAULT_TIMEOUT = 10 -def get(url: str, cookies = None, ua: str = None, return_type: str = None, encoding: str = None, +def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 网页请求核心函数 @@ -18,7 +18,8 @@ def get(url: str, cookies = None, ua: str = None, return_type: str = None, encod """ errors = "" headers = {"User-Agent": ua or G_USER_AGENT} - + if extra_headers != None: + headers.update(extra_headers) for i in range(retry): try: result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies, diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index d5a4c3e..2646c72 100644 --- 
a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -29,10 +29,8 @@ class Javbus(Parser): expr_tags = '/html/head/meta[@name="keywords"]/@content' expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]' - def search(self, number, core: None): - + def search(self, number): self.number = number - self.updateCore(core) try: url = "https://www." + secrets.choice([ 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', @@ -139,7 +137,9 @@ class Javbus(Parser): return '' def getOutline(self, htmltree): - if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): - return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 - from .storyline import getStoryline - return getStoryline(self.number , uncensored = self.uncensored) + if self.morestoryline: + if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'): + return '' # 从airav.py过来的调用不计算outline直接返回,避免重复抓取数据拖慢处理速度 + from .storyline import getStoryline + return getStoryline(self.number , uncensored = self.uncensored) + return '' diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index ca49469..a5ad159 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -43,15 +43,22 @@ class Javdb(Parser): def updateCore(self, core): if core.proxies: self.proxies = core.proxies + if core.verify: + self.verify = core.verify + if core.morestoryline: + self.morestoryline = True + # special if core.dbcookies: self.cookies = core.dbcookies else: self.cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} + if core.dbsite: + self.dbsite = core.dbsite + else: + self.dbsite = 'javdb' - def search(self, number, core: None): + def search(self, number): self.number = number - self.updateCore(core) - self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) self.detailurl = self.queryNumberUrl(number) @@ -61,7 +68,7 @@ class Javdb(Parser): return result def queryNumberUrl(self, number): - javdb_url = 'https://javdb.com/search?q=' + number + '&f=all' + javdb_url = 'https://' + self.dbsite + '.com/search?q=' + number + '&f=all' try: resp = self.session.get(javdb_url) except Exception as e: @@ -148,8 +155,10 @@ class Javdb(Parser): return r def getOutline(self, htmltree): - from .storyline import getStoryline - return getStoryline(self.number, self.getUncensored(htmltree)) + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number, self.getUncensored(htmltree)) + return '' def getStudio(self, htmltree): try: diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index d5dda47..d300b4a 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -15,10 +15,8 @@ class Madou(Parser): expr_studio = '//a[@rel="category tag"]/text()' expr_tags = '/html/head/meta[@name="keywords"]/@content' - def search(self, number, core: None): + def search(self, number): self.number = number.lower().strip() - self.updateCore(core) - self.detailurl = "https://madou.club/" + number + ".html" self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: diff --git a/scrapinglib/mgstage.py b/scrapinglib/mgstage.py index 8662f59..c4c229e 100644 --- a/scrapinglib/mgstage.py +++ b/scrapinglib/mgstage.py @@ -23,10 +23,8 @@ class Mgstage(Parser): expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()' expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()' - def search(self, number, core: None): + def search(self, number): self.number = number.upper() - self.updateCore(core) - 
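
A sketch of how the javdb hunk above resolves its settings, assuming `core` is the Scraping instance handed down from api.py; the mirror host name is hypothetical:

    db = Javdb()
    db.updateCore(core)
    # core.dbcookies unset  -> cookies fall back to
    #                          {'over18': '1', 'theme': 'auto', 'locale': 'zh'}
    # core.dbsite unset     -> default 'javdb', so queryNumberUrl() searches
    #                          https://javdb.com/search?q=ABC-123&f=all
    # core.dbsite = 'javdb9' (hypothetical mirror) -> searches
    #                          https://javdb9.com/search?q=ABC-123&f=all
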
self.cookies = {'adc':'1'} self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' self.htmlcode = self.getHtml(self.detailurl) diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index 5ded60c..c27665c 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -11,10 +11,13 @@ class Parser: source = 'base' imagecut = 1 uncensored = False + allow_number_change = False # update proxies = None - cookies = None verify = None + extraheader = None + cookies = None + morestoryline = False number = '' detailurl = '' @@ -47,12 +50,15 @@ class Parser: def __init__(self) -> None: pass - def search(self, number, core: None): - """ 搜索番号 + def scrape(self, number, core: None): + """ 刮削番号 """ - self.number = number self.updateCore(core) + result = self.search(number) + return result + def search(self, number): + self.number = number self.detailurl = self.queryNumberUrl(number) htmltree = self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) @@ -66,6 +72,10 @@ class Parser: """ if core.proxies: self.proxies = core.proxies + if core.verify: + self.verify = core.verify + if core.morestoryline: + self.morestoryline = True def queryNumberUrl(self, number): """ 根据番号查询详细信息url @@ -78,7 +88,7 @@ class Parser: def getHtml(self, url, type = None): """ 访问网页 """ - resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, verify=self.verify, return_type=type) + resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type) if '404 Page Not Found' in resp \ or '<title>未找到页面' in resp \ or '404 Not Found' in resp \ diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py index d990628..d787e54 100644 --- a/scrapinglib/xcity.py +++ b/scrapinglib/xcity.py @@ -59,8 +59,10 @@ class Xcity(Parser): return '' def getOutline(self, htmltree): - from .storyline import getStoryline - return getStoryline(self.number, uncensored=False) + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number, uncensored=False) + return '' def getActors(self, htmltree): htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') @@ -111,10 +113,8 @@ class Xcity(Parser): raise ValueError("xcity.py: detail page not found") return str(browser.page), browser - def search(self, number, core: None): + def search(self, number): self.number = number - self.updateCore(core) - self.detail_page, self.browser = self.open_by_browser(number) self.detailurl = self.browser.url lx = etree.fromstring(self.detail_page, etree.HTMLParser())
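
Taken together with the parser.py hunk, every site module now follows the same call chain. A sketch with a stand-in subclass (`SomeParser` is any of the parsers above; `core` is the Scraping instance from api.py):

    result = SomeParser().scrape('ABC-123', core)
    # scrape(number, core)
    #   -> updateCore(core)   # copies proxies / verify / morestoryline onto the parser
    #   -> search(number)     # base version: queryNumberUrl() -> getHtmlTree()
    #                         # -> dictformat(); most sites override it with
    #                         # their own fetch logic, as the hunks above show
    # getHtml() now also forwards self.extraheader to httprequest.get() as
    # extra_headers, so parsers such as dlGetchu can set a Referer without
    # any request plumbing of their own.
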