From 153cdcde00d36d7a201cd8e3a936671f32e2e81f Mon Sep 17 00:00:00 2001 From: Mathhew Date: Tue, 16 Aug 2022 09:24:16 +0800 Subject: [PATCH] fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 优化avsox刮削 FC2 - 修复javdb与library的specifiedUrl - 其他 --- scrapinglib/avsox.py | 15 +++++++++++---- scrapinglib/dlsite.py | 1 - scrapinglib/javdb.py | 2 ++ scrapinglib/javlibrary.py | 2 ++ scrapinglib/mgstage.py | 1 - scrapinglib/parser.py | 25 ++++++++++++++++++------- scrapinglib/storyline.py | 4 ++-- 7 files changed, 35 insertions(+), 15 deletions(-) diff --git a/scrapinglib/avsox.py b/scrapinglib/avsox.py index 9c324a6..7788e13 100644 --- a/scrapinglib/avsox.py +++ b/scrapinglib/avsox.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -import re from .parser import Parser @@ -8,6 +7,7 @@ class Avsox(Parser): source = 'avsox' imagecut = 3 + originalnum = '' expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()' expr_actor = '//a[@class="avatar-box"]' @@ -21,7 +21,11 @@ class Avsox(Parser): expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()' expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()' - def queryNumberUrl(self, number): + def queryNumberUrl(self, number: str): + upnum = number.upper() + if 'FC2' in upnum and 'FC2-PPV' not in upnum: + number = upnum.replace('FC2', 'FC2-PPV') + self.number = number qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox') site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href') self.searchtree = self.getHtmlTree(site + '/cn/search/' + number) @@ -38,11 +42,14 @@ class Avsox(Parser): new_number = self.getTreeElement(htmltree, self.expr_number) if new_number.upper() != self.number.upper(): raise ValueError('number not found in ' + self.source) + self.originalnum = new_number + if 'FC2-PPV' in new_number.upper(): + new_number = new_number.upper().replace('FC2-PPV', 'FC2') self.number = new_number - return new_number + return self.number def getTitle(self, htmltree): - return super().getTitle(htmltree).replace('/', '').strip(self.number) + return super().getTitle(htmltree).replace('/', '').strip(self.originalnum).strip() def getStudio(self, htmltree): return super().getStudio(htmltree).replace("', '", ' ') diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py index 6edd854..c8415d4 100644 --- a/scrapinglib/dlsite.py +++ b/scrapinglib/dlsite.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- import re -from lxml import etree from .parser import Parser diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py index c21a819..0285de8 100644 --- a/scrapinglib/javdb.py +++ b/scrapinglib/javdb.py @@ -51,6 +51,8 @@ class Javdb(Parser): self.verify = core.verify if core.morestoryline: self.morestoryline = True + if core.specifiedSource == self.source: + self.specifiedUrl = core.specifiedUrl # special if core.dbcookies: self.cookies = core.dbcookies diff --git a/scrapinglib/javlibrary.py b/scrapinglib/javlibrary.py index b2c7d19..61bc127 100644 --- a/scrapinglib/javlibrary.py +++ b/scrapinglib/javlibrary.py @@ -29,6 +29,8 @@ class Javlibrary(Parser): self.verify = core.verify if core.morestoryline: self.morestoryline = True + if core.specifiedSource == self.source: + self.specifiedUrl = core.specifiedUrl self.cookies = {'over18':'1'} def search(self, number): diff --git a/scrapinglib/mgstage.py b/scrapinglib/mgstage.py index de279fc..0f4d759 100644 --- a/scrapinglib/mgstage.py +++ b/scrapinglib/mgstage.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- -from .utils import getTreeElement from .parser import Parser diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index 90670ff..0c3f3c6 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -228,17 +228,28 @@ class Parser: def getActorPhoto(self, htmltree) -> dict: return {} - def getUncensored(self, htmlree) -> bool: - if self.expr_uncensored: - u = self.getTreeAll(htmlree, self.expr_uncensored) - return bool(u) - else: + def getUncensored(self, htmltree) -> bool: + """ + tag: 無码 無修正 uncensored 无码 + title: 無碼 無修正 uncensored + """ + if self.uncensored: return self.uncensored + tags = [x.lower() for x in self.getTags(htmltree) if len(x)] + title = self.getTitle(htmltree) + if self.expr_uncensored: + u = self.getTreeAll(htmltree, self.expr_uncensored) + self.uncensored = bool(u) + elif '無码' in tags or '無修正' in tags or 'uncensored' in tags or '无码' in tags: + self.uncensored = True + elif '無码' in title or '無修正' in title or 'uncensored' in title.lower(): + self.uncensored = True + return self.uncensored - def getImagecut(self, htmlree): + def getImagecut(self, htmltree): """ 修正 无码poster不裁剪cover """ - if self.imagecut == 1 and self.getUncensored(htmlree): + if self.imagecut == 1 and self.getUncensored(htmltree): self.imagecut = 0 return self.imagecut diff --git a/scrapinglib/storyline.py b/scrapinglib/storyline.py index 306789a..65c346c 100644 --- a/scrapinglib/storyline.py +++ b/scrapinglib/storyline.py @@ -15,8 +15,8 @@ from urllib.parse import urljoin from lxml.html import fromstring from multiprocessing.dummy import Pool as ThreadPool -from scrapinglib.airav import Airav -from scrapinglib.xcity import Xcity +from .airav import Airav +from .xcity import Xcity from .httprequest import get_html_by_form, get_html_by_scraper, request_session # 舍弃 Amazon 源