diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py index 9f2e761..ee27a25 100644 --- a/scrapinglib/__init__.py +++ b/scrapinglib/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .api import search +from .api import search, getSupportedSources diff --git a/scrapinglib/airav.py b/scrapinglib/airav.py index c22384d..f0e2e39 100644 --- a/scrapinglib/airav.py +++ b/scrapinglib/airav.py @@ -8,6 +8,9 @@ from .javbus import Javbus class Airav(Parser): source = 'airav' + # for javbus + specifiedSource = None + addtion_Javbus = True expr_title = '/html/head/title/text()' expr_number = '/html/head/title/text()' @@ -21,23 +24,38 @@ class Airav(Parser): def search(self, number): self.number = number - self.detailurl = 'https://cn.airav.wiki/video/' + number - engine = Javbus() - javbusinfo = engine.scrape(number, self) - if javbusinfo == 404: - self.javbus = {"title": ""} + if self.specifiedUrl: + self.detailurl = self.specifiedUrl else: - self.javbus = json.loads(javbusinfo) + self.detailurl = self.queryNumberUrl(self.number) + if self.addtion_Javbus: + engine = Javbus() + javbusinfo = engine.scrape(self.number, self) + if javbusinfo == 404: + self.javbus = {"title": ""} + else: + self.javbus = json.loads(javbusinfo) self.htmlcode = self.getHtml(self.detailurl) htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser()) result = self.dictformat(htmltree) return result + def queryNumberUrl(self, number): + queryUrl = "https://cn.airav.wiki/?search=" + number + queryTree = self.getHtmlTree(queryUrl) + results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a') + for i in results: + num = self.getTreeElement(i, '//div/div[contains(@class,"videoNumber")]/p[1]/text()') + if num.replace('-','') == number.replace('-','').upper(): + self.number = num + return "https://cn.airav.wiki" + i.attrib['href'] + return 'https://cn.airav.wiki/video/' + number + def getNum(self, htmltree): - # return super().getNum(htmltree) - result = self.javbus.get('number') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('number') + if isinstance(result, str) and len(result): + return result number = super().getNum(htmltree) result = str(re.findall('^\[(.*?)]', number)[0]) return result @@ -48,24 +66,27 @@ class Airav(Parser): return result def getStudio(self, htmltree): - result = self.javbus.get('studio') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('studio') + if isinstance(result, str) and len(result): + return result return super().getStudio(htmltree) def getRelease(self, htmltree): - result = self.javbus.get('release') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('release') + if isinstance(result, str) and len(result): + return result try: return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group() except: return '' def getYear(self, htmltree): - result = self.javbus.get('year') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('year') + if isinstance(result, str) and len(result): + return result release = self.getRelease(htmltree) return str(re.findall('\d{4}', release)).strip(" ['']") @@ -73,39 +94,40 @@ class Airav(Parser): return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip() def getRuntime(self, htmltree): - result = self.javbus.get('runtime') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('runtime') + if isinstance(result, str) and len(result): + return result return '' def getDirector(self, htmltree): - result = self.javbus.get('director') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('director') + if isinstance(result, str) and len(result): + return result return '' def getActors(self, htmltree): - b=[] a = super().getActors(htmltree) - for v in a: - v = v.strip() - if len(v): - b.append(v) + b = [ i.strip() for i in a if len(i)] if len(b): return b - result = self.javbus.get('actor') - if isinstance(result, list) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('actor') + if isinstance(result, list) and len(result): + return result return [] def getCover(self, htmltree): - result = self.javbus.get('cover') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('cover') + if isinstance(result, str) and len(result): + return result return super().getCover(htmltree) def getSeries(self, htmltree): - result = self.javbus.get('series') - if isinstance(result, str) and len(result): - return result + if self.addtion_Javbus: + result = self.javbus.get('series') + if isinstance(result, str) and len(result): + return result return '' diff --git a/scrapinglib/api.py b/scrapinglib/api.py index c8c4679..4314dee 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -18,29 +18,45 @@ from .mgstage import Mgstage from .javbus import Javbus from .xcity import Xcity from .avsox import Avsox +from .javlibrary import Javlibrary from .tmdb import Tmdb +from .imdb import Imdb def search(number, sources: str=None, proxies=None, verify=None, type='adult', + specifiedSource=None, specifiedUrl=None, dbcookies=None, dbsite=None, morestoryline=False): - """ 根据``番号/电影``名搜索信息 + """ 根据`番号/电影`名搜索信息 :param number: number/name depends on type - :param sources: sources string with `,` like ``avsox,javbus`` - :param type: ``adult``, ``general`` + :param sources: sources string with `,` Eg: `avsox,javbus` + :param type: `adult`, `general` """ sc = Scraping() return sc.search(number, sources, proxies=proxies, verify=verify, type=type, + specifiedSource=specifiedSource, specifiedUrl=specifiedUrl, dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline) + +def getSupportedSources(tag='adult'): + """ + :param tag: `adult`, `general` + """ + sc = Scraping() + if tag == 'adult': + return ','.join(sc.adult_full_sources) + else: + return ','.join(sc.general_full_sources) + + class Scraping(): """ """ - - adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2', - 'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91', - 'gcolle', 'javdb', 'getchu'] + adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321', + 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91', + 'getchu', 'gcolle' + ] adult_func_mapping = { 'avsox': Avsox().scrape, 'javbus': Javbus().scrape, @@ -57,15 +73,19 @@ class Scraping(): 'gcolle': Gcolle().scrape, 'javdb': Javdb().scrape, 'getchu': Getchu().scrape, + 'javlibrary': Javlibrary().scrape, } - general_full_sources = ['tmdb'] + general_full_sources = ['tmdb','imdb'] general_func_mapping = { 'tmdb': Tmdb().scrape, + 'imdb': Imdb().scrape, } proxies = None verify = None + specifiedSource = None + specifiedUrl = None dbcookies = None dbsite = None @@ -73,9 +93,12 @@ class Scraping(): morestoryline = False def search(self, number, sources=None, proxies=None, verify=None, type='adult', + specifiedSource=None, specifiedUrl=None, dbcookies=None, dbsite=None, morestoryline=False): self.proxies = proxies self.verify = verify + self.specifiedSource = specifiedSource + self.specifiedUrl = specifiedUrl self.dbcookies = dbcookies self.dbsite = dbsite self.morestoryline = morestoryline @@ -88,7 +111,10 @@ class Scraping(): """ 查询电影电视剧 imdb,tmdb """ - sources = self.checkGeneralSources(sources, name) + if self.specifiedSource: + sources = [self.specifiedSource] + else: + sources = self.checkGeneralSources(sources, name) json_data = {} for source in sources: try: @@ -116,7 +142,10 @@ class Scraping(): return json_data def searchAdult(self, number, sources): - sources = self.checkAdultSources(sources, number) + if self.specifiedSource: + sources = [self.specifiedSource] + else: + sources = self.checkAdultSources(sources, number) json_data = {} for source in sources: try: diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index c41cb6e..9cb5213 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -50,10 +50,14 @@ class Avsox(Parser): def getSmallCover(self, htmltree): """ 使用搜索页面的预览小图 """ - return self.getTreeElement(self.searchtree, self.expr_smallcover) + try: + return self.getTreeElement(self.searchtree, self.expr_smallcover) + except: + self.imagecut = 1 + return '' def getTags(self, htmltree): - tags = self.getTreeElement(htmltree).split(',') + tags = self.getTreeElement(htmltree, self.expr_tags).split(',') return [i.strip() for i in tags[2:]] if len(tags) > 2 else [] def getOutline(self, htmltree): diff --git a/scrapinglib/carib.py b/scrapinglib/carib.py index 3d9f094..decaba6 100644 --- a/scrapinglib/carib.py +++ b/scrapinglib/carib.py @@ -22,7 +22,10 @@ class Carib(Parser): def search(self, number): self.number = number - self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html' htmlcode = self.getHtml(self.detailurl) if htmlcode == 404 or 'class="movie-info section"' not in htmlcode: return 404 diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index cc701c9..6edd854 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -29,7 +29,12 @@ class Dlsite(Parser): def search(self, number): self.cookies = {'locale': 'zh-cn'} - if "RJ" in number or "VJ" in number: + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + # TODO 应该从页面内获取 number + self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']") + htmltree = self.getHtmlTree(self.detailurl) + elif "RJ" in number or "VJ" in number: self.number = number.upper() self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN' htmltree = self.getHtmlTree(self.detailurl) diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py index f6cc01c..2706b91 100644 --- a/scrapinglib/fanza.py +++ b/scrapinglib/fanza.py @@ -11,15 +11,21 @@ class Fanza(Parser): expr_title = '//*[starts-with(@id, "title")]/text()' expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()" - expr_cover = '//head/meta[@property="og:image"]' + expr_cover = './/head/meta[@property="og:image"]/@content' expr_extrafanart = '//a[@name="sample-image"]/img/@src' expr_outline = "//div[@class='mg-b20 lh4']/text()" expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()" - expr_outline_og = '//head/meta[@property="og:description"]' + expr_outline_og = '//head/meta[@property="og:description"]/@content' expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()" def search(self, number): self.number = number + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + durl = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl}) + self.htmltree = self.getHtmlTree(durl) + result = self.dictformat(self.htmltree) + return result # fanza allow letter + number + underscore, normalize the input here # @note: I only find the usage of underscore as h_test123456789 fanza_search_number = number @@ -75,7 +81,7 @@ class Fanza(Parser): if result == '': result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "") if "※ 配信方法によって収録内容が異なる場合があります。" == result: - result = self.getTreeElement(htmltree, self.expr_outline_og).get('content') + result = self.getTreeElement(htmltree, self.expr_outline_og) return result except: return '' @@ -99,9 +105,6 @@ class Fanza(Parser): result = self.getFanzaString('配信開始日:') return result.replace("/", "-").strip('\\n') - def getCover(self, htmltree): - return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content') - def getTags(self, htmltree): return self.getFanzaStrings('ジャンル:') diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 13640ed..6707682 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -22,8 +22,11 @@ class Fc2(Parser): expr_tags = "//a[@class='tag tagTag']/text()" def search(self, number): - self.number = number.replace('FC2-', '').replace('fc2-', '') - self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' + self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '') + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/' self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/gcolle.py b/scrapinglib/gcolle.py index 100e3ef..c6d7027 100644 --- a/scrapinglib/gcolle.py +++ b/scrapinglib/gcolle.py @@ -2,7 +2,7 @@ import re from lxml import etree -from .httprequest import get_html_session +from .httprequest import request_session from .parser import Parser @@ -27,9 +27,12 @@ class Gcolle(Parser): def search(self, number): self.number = number.upper().replace('GCOLLE-','') - self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number - session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) - htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + self.number).text + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number + session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + htmlcode = session.get(self.detailurl).text htmltree = etree.HTML(htmlcode) r18url = self.getTreeElement(htmltree, self.expr_r18) diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py index eec16ec..1372ba8 100644 --- a/scrapinglib/getchu.py +++ b/scrapinglib/getchu.py @@ -35,7 +35,7 @@ class wwwGetchu(Parser): GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit=' expr_title = '//*[@id="soft-title"]/text()' - expr_cover = '//head/meta[@property="og:image"]' + expr_cover = '//head/meta[@property="og:image"]/@content' expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()" @@ -67,9 +67,6 @@ class wwwGetchu(Parser): def getNum(self, htmltree): return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0] - def getCover(self, htmltree): - return self.getTreeElement(htmltree, self.expr_cover).get('content') - def getActors(self, htmltree): return super().getDirector(htmltree) diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py index 997ff39..7e99819 100644 --- a/scrapinglib/httprequest.py +++ b/scrapinglib/httprequest.py @@ -9,8 +9,9 @@ from cloudscraper import create_scraper G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36' G_DEFAULT_TIMEOUT = 10 -def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None, - retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): + +def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None, + retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 网页请求核心函数 @@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: raise Exception('Connect Failed') -def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None, - retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): +def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None, + retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): """ 是否使用代理应由上层处理 """ @@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_ raise Exception('Connect Failed') -# -# TODO: 以下临时使用,更新完各站后,再更新 -# - - class TimeoutHTTPAdapter(HTTPAdapter): def __init__(self, *args, **kwargs): self.timeout = G_DEFAULT_TIMEOUT @@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter): return super().send(request, **kwargs) -# with keep-alive feature -# storyline carib gcolle javdb only -def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None, - encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None): +def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None): + """ + keep-alive + """ session = requests.Session() retries = Retry(total=retry, connect=retry, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) @@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ if proxies: session.proxies = proxies session.headers = {"User-Agent": ua or G_USER_AGENT} - try: - if isinstance(url, str) and len(url): - result = session.get(str(url)) - else: # 空url参数直接返回可重用session对象,无需设置return_type - return session - if not result.ok: - return None - if return_type == "object": - return result - elif return_type == "content": - return result.content - elif return_type == "session": - return result, session - else: - result.encoding = encoding or "utf-8" - return result.text - except requests.exceptions.ProxyError: - print("[-]get_html_session() Proxy error! Please check your Proxy") - except Exception as e: - print(f"[-]get_html_session() failed. {e}") - return None + return session + # storyline only # 使用 cloudscraper.... diff --git a/scrapinglib/imdb.py b/scrapinglib/imdb.py new file mode 100644 index 0000000..7aab483 --- /dev/null +++ b/scrapinglib/imdb.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + + +from .parser import Parser + + +class Imdb(Parser): + source = 'imdb' + imagecut = 0 + + expr_title = '//h1[@data-testid="hero-title-block__title"]/text()' + expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' + expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()' + expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()' + + def queryNumberUrl(self, number): + """ + TODO 区分 ID 与 名称 + """ + id = number + movieUrl = "https://www.imdb.com/title/" + id + return movieUrl diff --git a/scrapinglib/jav321.py b/scrapinglib/jav321.py index 31126f4..babdee2 100644 --- a/scrapinglib/jav321.py +++ b/scrapinglib/jav321.py @@ -26,6 +26,14 @@ class Jav321(Parser): return 'https://www.jav321.com/search' def getHtmlTree(self, url): + """ + 特殊处理 仅获取页面调用一次 + """ + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + resp = httprequest.get(self.detailurl, cookies=self.cookies, proxies=self.proxies, verify=self.verify) + self.detailhtml = resp + return etree.fromstring(resp, etree.HTMLParser()) resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify) if "/video/" in resp.url: self.detailurl = resp.url diff --git a/scrapinglib/javbus.py b/scrapinglib/javbus.py index bb6e978..8e52de1 100644 --- a/scrapinglib/javbus.py +++ b/scrapinglib/javbus.py @@ -32,7 +32,12 @@ class Javbus(Parser): def search(self, number): self.number = number - try: + try: + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + htmltree = self.getHtmlTree(self.detailurl) + result = self.dictformat(htmltree) + return result url = "https://www." + secrets.choice([ 'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun', 'cdnbus.fun', @@ -61,7 +66,10 @@ class Javbus(Parser): self.uncensored = True w_number = number.replace('.', '-') - self.detailurl = 'https://www.javbus.red/' + w_number + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://www.javbus.red/' + w_number self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index dae5092..3cacd05 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -4,7 +4,7 @@ import re from urllib.parse import urljoin from lxml import etree -from .httprequest import get_html_session +from .httprequest import request_session from .parser import Parser @@ -63,8 +63,11 @@ class Javdb(Parser): def search(self, number: str): self.number = number - self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) - self.detailurl = self.queryNumberUrl(number) + self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(number) self.deatilpage = self.session.get(self.detailurl).text if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage: self.noauth = True @@ -193,19 +196,19 @@ class Javdb(Parser): def getUserRating(self, htmltree): try: - result = str(self.getTreeElement(htmltree, self.expr_userrating)) - v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) - return float(v[0][0]) + numstrs = self.getTreeElement(htmltree, self.expr_userrating) + nums = re.findall('[0-9.]+', numstrs) + return float(nums[0]) except: - return + return '' def getUserVotes(self, htmltree): try: - result = str(self.getTreeElement(htmltree, self.expr_uservotes)) - v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result) - return int(v[0][1]) + result = self.getTreeElement(htmltree, self.expr_uservotes) + v = re.findall('[0-9.]+', result) + return int(v[1]) except: - return + return '' def getaphoto(self, url, session): html_page = session.get(url).text diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py new file mode 100644 index 0000000..782fa22 --- /dev/null +++ b/scrapinglib/javlibrary.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +from lxml import etree +from .httprequest import request_session +from .parser import Parser + + +class Javlibrary(Parser): + source = 'javlibrary' + + htmltree = None + + expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()' + expr_title = '//div[@id="video_title"]/h3/a/text()' + expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()' + expr_tags = '//div[@id="video_genres"]/table/tr/td[@class="text"]/span/a/text()' + expr_cover = '//img[@id="video_jacket_img"]/@src' + expr_release = '//div[@id="video_date"]/table/tr/td[@class="text"]/text()' + expr_studio = '//div[@id="video_maker"]/table/tr/td[@class="text"]/span/a/text()' + expr_runtime = '//div[@id="video_length"]/table/tr/td/span[@class="text"]/text()' + expr_userrating = '//div[@id="video_review"]/table/tr/td/span[@class="score"]/text()' + expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()' + expr_extrafanart = '//div[@class="previewthumbs"]/img/@src' + + def updateCore(self, core): + if core.proxies: + self.proxies = core.proxies + if core.verify: + self.verify = core.verify + if core.morestoryline: + self.morestoryline = True + self.cookies = {'over18':'1'} + + def search(self, number): + self.number = number.upper() + self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(self.number) + if not self.detailurl: + return 404 + if self.htmltree is None: + deatils = self.session.get(self.detailurl) + self.htmltree = etree.fromstring(deatils.text, etree.HTMLParser()) + result = self.dictformat(self.htmltree) + return result + + def queryNumberUrl(self, number:str): + queryUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + number + queryResult = self.session.get(queryUrl) + + if queryResult and "/?v=jav" in queryResult.url: + self.htmltree = etree.fromstring(queryResult.text, etree.HTMLParser()) + return queryResult.url + else: + queryTree = etree.fromstring(queryResult.text, etree.HTMLParser()) + numbers = queryTree.xpath('//div[@class="id"]/text()') + if number in numbers: + urls = queryTree.xpath('//div[@class="id"]/../@href') + detailurl = urls[numbers.index(number)] + return "http://www.javlibrary.com/cn" + detailurl.strip('.') + return None + + def getTitle(self, htmltree): + title = super().getTitle(htmltree) + title = title.replace(self.getNum(htmltree), '').strip() + return title + + def getCover(self, htmltree): + url = super().getCover(htmltree) + if not url.startswith('http'): + url = 'https:' + url + return url + + def getOutline(self, htmltree): + if self.morestoryline: + from .storyline import getStoryline + return getStoryline(self.number, self.getUncensored(htmltree)) + return '' diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index f132c22..f3ffd20 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -8,6 +8,7 @@ from .parser import Parser class Madou(Parser): source = 'madou' + imagecut = 0 uncensored = True expr_url = '//a[@class="share-weixin"]/@data-url' @@ -17,7 +18,10 @@ class Madou(Parser): def search(self, number): self.number = number.lower().strip() - self.detailurl = "https://madou.club/" + number + ".html" + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = "https://madou.club/" + number + ".html" self.htmlcode = self.getHtml(self.detailurl) if self.htmlcode == 404: return 404 diff --git a/scrapinglib/mgstage.py b/scrapinglib/mgstage.py index a09540b..de279fc 100644 --- a/scrapinglib/mgstage.py +++ b/scrapinglib/mgstage.py @@ -25,7 +25,10 @@ class Mgstage(Parser): def search(self, number): self.number = number.upper() self.cookies = {'adc':'1'} - self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/' htmltree =self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) return result diff --git a/scrapinglib/mv91.py b/scrapinglib/mv91.py index cf10144..7d589b1 100644 --- a/scrapinglib/mv91.py +++ b/scrapinglib/mv91.py @@ -8,6 +8,8 @@ from .parser import Parser class Mv91(Parser): source = 'mv91' + imagecut = 0 + uncensored = True expr_number = '//div[@class="player-title"]/text()' expr_title = '//div[@class="player-title"]/text()' @@ -53,8 +55,8 @@ class Mv91(Parser): result = str(finds[0][0]) else: result = ' '.join(title.replace('/',' ').split()) - result = result.split()[0].replace('「预告」','') - return result.strip() + result = result.split()[0] + return result.replace('「预告」','').strip('/ ') except: return '' diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index a861c88..fa25d4e 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -11,7 +11,10 @@ class Parser: """ 基础刮削类 """ source = 'base' - # poster: `0` 复制 `1` 裁剪 + # 推荐剪切poster封面: + # `0` 复制cover + # `1` 裁剪cover + # `3` 下载小封面 imagecut = 1 uncensored = False allow_number_change = False @@ -21,6 +24,7 @@ class Parser: extraheader = None cookies = None morestoryline = False + specifiedUrl = None number = '' detailurl = '' @@ -61,8 +65,19 @@ class Parser: return result def search(self, number): + """ 查询番号 + + 查询主要流程: + 1. 获取 url + 2. 获取详情页面 + 3. 解析 + 4. 返回 result + """ self.number = number - self.detailurl = self.queryNumberUrl(number) + if self.specifiedUrl: + self.detailurl = self.specifiedUrl + else: + self.detailurl = self.queryNumberUrl(number) htmltree = self.getHtmlTree(self.detailurl) result = self.dictformat(htmltree) return result @@ -79,13 +94,16 @@ class Parser: self.verify = core.verify if core.morestoryline: self.morestoryline = True + if core.specifiedSource == self.source: + self.specifiedUrl = core.specifiedUrl def queryNumberUrl(self, number): """ 根据番号查询详细信息url + 需要针对不同站点修改,或者在上层直接获取 备份查询页面,预览图可能需要 """ - url = httprequest.get(number) + url = "http://detailurl.ai/" + number return url def getHtml(self, url, type = None): @@ -115,26 +133,26 @@ class Parser: 'number': self.getNum(htmltree), 'title': self.getTitle(htmltree), 'studio': self.getStudio(htmltree), + 'release': self.getRelease(htmltree), 'year': self.getYear(htmltree), 'outline': self.getOutline(htmltree), 'runtime': self.getRuntime(htmltree), 'director': self.getDirector(htmltree), 'actor': self.getActors(htmltree), - 'release': self.getRelease(htmltree), + 'actor_photo': self.getActorPhoto(htmltree), 'cover': self.getCover(htmltree), 'cover_small': self.getSmallCover(htmltree), 'extrafanart': self.getExtrafanart(htmltree), 'trailer': self.getTrailer(htmltree), - 'imagecut': self.imagecut, 'tag': self.getTags(htmltree), 'label': self.getLabel(htmltree), - 'actor_photo': self.getActorPhoto(htmltree), + 'series': self.getSeries(htmltree), + 'userrating': self.getUserRating(htmltree), + 'uservotes': self.getUserVotes(htmltree), + 'uncensored': self.getUncensored(htmltree), 'website': self.detailurl, 'source': self.source, - 'series': self.getSeries(htmltree), - 'uncensored': self.getUncensored(htmltree), - 'userrating': self.getUserRating(htmltree), - 'uservotes': self.getUserVotes(htmltree) + 'imagecut': self.getImagecut(htmltree), } dic = self.extradict(dic) except Exception as e: @@ -215,11 +233,26 @@ class Parser: else: return self.uncensored + def getImagecut(self, htmlree): + """ 修正 无码poster不裁剪cover + """ + if self.imagecut == 1 and self.getUncensored(htmlree): + self.imagecut = 0 + return self.imagecut + def getUserRating(self, htmltree): - return self.getTreeElement(htmltree, self.expr_userrating) + numstrs = self.getTreeElement(htmltree, self.expr_userrating) + nums = re.findall('[0-9.]+', numstrs) + if len(nums) == 1: + return float(nums[0]) + return '' def getUserVotes(self, htmltree): - return self.getTreeElement(htmltree, self.expr_uservotes) + votestrs = self.getTreeElement(htmltree, self.expr_uservotes) + votes = re.findall('[0-9]+', votestrs) + if len(votes) == 1: + return int(votes[0]) + return '' def getTreeElement(self, tree: html.HtmlElement, expr, index=0): """ 根据表达式从`xmltree`中获取匹配值,默认 index 为 0 diff --git a/scrapinglib/storyline.py b/scrapinglib/storyline.py index 458388f..2194c76 100644 --- a/scrapinglib/storyline.py +++ b/scrapinglib/storyline.py @@ -13,7 +13,7 @@ import builtins from urllib.parse import urljoin from lxml.html import fromstring from multiprocessing.dummy import Pool as ThreadPool -from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session +from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session # 舍弃 Amazon 源 G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"} @@ -112,7 +112,8 @@ def getStoryline_airav(number, debug): try: site = secrets.choice(('airav.cc','airav4.club')) url = f'https://{site}/searchresults.aspx?Search={number}&Type=0' - res, session = get_html_session(url, return_type='session') + session = request_session() + res = session.get(url) if not res: raise ValueError(f"get_html_by_session('{url}') failed") lx = fromstring(res.text) diff --git a/scrapinglib/tmdb.py b/scrapinglib/tmdb.py index 1b8e02b..0856b79 100644 --- a/scrapinglib/tmdb.py +++ b/scrapinglib/tmdb.py @@ -13,10 +13,10 @@ class Tmdb(Parser): imagecut = 0 apikey = None - expr_title = '//head/meta[@property="og:title"]' + expr_title = '//head/meta[@property="og:title"]/@content' expr_release = '//div/span[@class="release"]/text()' - expr_cover = '//head/meta[@property="og:image"]' - expr_outline = '//head/meta[@property="og:description"]' + expr_cover = '//head/meta[@property="og:image"]/@content' + expr_outline = '//head/meta[@property="og:description"]/@content' # def search(self, number): # self.detailurl = self.queryNumberUrl(number) @@ -30,11 +30,6 @@ class Tmdb(Parser): movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN" return movieUrl - def getTitle(self, htmltree): - return self.getTreeElement(htmltree, self.expr_title).get('content') - def getCover(self, htmltree): - return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content') + return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover) - def getOutline(self, htmltree): - return self.getTreeElement(htmltree, self.expr_outline).get('content') diff --git a/scrapinglib/xcity.py b/scrapinglib/xcity.py index 2be48f6..36230bb 100644 --- a/scrapinglib/xcity.py +++ b/scrapinglib/xcity.py @@ -13,6 +13,9 @@ class Xcity(Parser): expr_number = '//*[@id="hinban"]/text()' expr_title = '//*[@id="program_detail_title"]/text()' + expr_actor = '//ul/li[@class="credit-links"]/a/text()' + expr_actor_link = '//ul/li[@class="credit-links"]/a' + expr_actorphoto = '//div[@class="frame"]/div/p/img/@src' expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()' expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()' expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()' @@ -23,6 +26,7 @@ class Xcity(Parser): expr_director = '//*[@id="program_detail_director"]/text()' expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()" expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()" + expr_extrafanart = '//div[@id="sample_images"]/div/a/@href' def getStudio(self, htmltree): return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '') @@ -57,41 +61,29 @@ class Xcity(Parser): return getStoryline(self.number, uncensored=False) return '' - def getActors(self, htmltree): - htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = [] - for i in htmla: - t.append(i.text.strip()) - return t - def getActorPhoto(self, htmltree): - htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a') - t = {i.text.strip(): i['href'] for i in htmla} + treea = self.getTreeAll(htmltree, self.expr_actor_link) + t = {i.text.strip(): i.attrib['href'] for i in treea} o = {} for k, v in t.items(): - r = self.browser.open_relative(v) - if not r.ok: - continue - pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img') - if 'noimage.gif' in pic['src']: - continue - o[k] = urljoin(self.browser.url, pic['src']) + actorpageUrl = "https://xcity.jp" + v + try: + adtree = self.getHtmlTree(actorpageUrl) + picUrl = self.getTreeElement(adtree, self.expr_actorphoto) + if 'noimage.gif' in picUrl: + continue + o[k] = urljoin("https://xcity.jp", picUrl) + except: + pass return o def getExtrafanart(self, htmltree): - html_pather = re.compile(r'