Merge pull request #858 from Suwmlee/master

fixes
Yoshiko2 committed (via GitHub) on 2022-08-25 23:28:51 +08:00
14 changed files with 112 additions and 72 deletions


@@ -8,9 +8,6 @@ from .javbus import Javbus

 class Airav(Parser):
     source = 'airav'
-    # for javbus
-    specifiedSource = None
-    addtion_Javbus = True

     expr_title = '/html/head/title/text()'
     expr_number = '/html/head/title/text()'
@@ -22,6 +19,11 @@ class Airav(Parser):
     expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
     expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'

+    def extraInit(self):
+        # for javbus
+        self.specifiedSource = None
+        self.addtion_Javbus = True
+
     def search(self, number):
         self.number = number
         if self.specifiedUrl:


@@ -1,13 +1,10 @@
 # -*- coding: utf-8 -*-
-import re
 from .parser import Parser


 class Avsox(Parser):
     source = 'avsox'
-    imagecut = 3

     expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
     expr_actor = '//a[@class="avatar-box"]'
@@ -21,7 +18,15 @@ class Avsox(Parser):
     expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
     expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'

-    def queryNumberUrl(self, number):
+    def extraInit(self):
+        self.imagecut = 3
+        self.originalnum = ''
+
+    def queryNumberUrl(self, number: str):
+        upnum = number.upper()
+        if 'FC2' in upnum and 'FC2-PPV' not in upnum:
+            number = upnum.replace('FC2', 'FC2-PPV')
+        self.number = number
         qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
         site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
         self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
@@ -38,11 +43,14 @@ class Avsox(Parser):
         new_number = self.getTreeElement(htmltree, self.expr_number)
         if new_number.upper() != self.number.upper():
             raise ValueError('number not found in ' + self.source)
+        self.originalnum = new_number
+        if 'FC2-PPV' in new_number.upper():
+            new_number = new_number.upper().replace('FC2-PPV', 'FC2')
         self.number = new_number
-        return new_number
+        return self.number

     def getTitle(self, htmltree):
-        return super().getTitle(htmltree).replace('/', '').strip(self.number)
+        return super().getTitle(htmltree).replace('/', '').strip(self.originalnum).strip()

     def getStudio(self, htmltree):
         return super().getStudio(htmltree).replace("', '", ' ')
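
The FC2 handling added to the Avsox parser above is a round trip: the query sent to the site uses the FC2-PPV form it indexes, the scraped number is normalized back to the plain FC2 form, and the untouched value is kept in self.originalnum so it can be stripped from the title. A standalone sketch of that normalization (the helper names are hypothetical, not part of the patch):

# Hypothetical helpers mirroring the FC2 normalization added to the Avsox parser;
# a sketch for illustration, not code from the patch itself.
def to_avsox_query(number: str) -> str:
    upnum = number.upper()
    if 'FC2' in upnum and 'FC2-PPV' not in upnum:
        return upnum.replace('FC2', 'FC2-PPV')   # the site indexes FC2 items as FC2-PPV
    return number

def normalize_scraped_number(new_number: str) -> str:
    if 'FC2-PPV' in new_number.upper():
        return new_number.upper().replace('FC2-PPV', 'FC2')  # report the plain FC2 form
    return new_number

assert to_avsox_query('FC2-1234567') == 'FC2-PPV-1234567'
assert normalize_scraped_number('FC2-PPV-1234567') == 'FC2-1234567'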


@@ -8,7 +8,6 @@ from .parser import Parser

 class Carib(Parser):
     source = 'carib'
-    uncensored = True

     expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
     expr_release = "//li[2]/span[@class='spec-content']/text()"
@@ -20,6 +19,9 @@ class Carib(Parser):
     expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
     expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"

+    def extraInit(self):
+        self.uncensored = True
+
     def search(self, number):
         self.number = number
         if self.specifiedUrl:


@@ -1,14 +1,11 @@
 # -*- coding: utf-8 -*-
 import re
-from lxml import etree
 from .parser import Parser


 class Dlsite(Parser):
     source = 'dlsite'
-    imagecut = 4
-    allow_number_change = True

     expr_title = '/html/head/title/text()'
     expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
@@ -27,6 +24,10 @@ class Dlsite(Parser):
     expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
     expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'

+    def extraInit(self):
+        self.imagecut = 4
+        self.allow_number_change = True
+
     def search(self, number):
         self.cookies = {'locale': 'zh-cn'}
         if self.specifiedUrl:


@@ -9,7 +9,6 @@ from .parser import Parser

 class Fc2(Parser):
     source = 'fc2'
-    imagecut = 0

     expr_title = '/html/head/title/text()'
     expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
@@ -21,6 +20,9 @@ class Fc2(Parser):
     expr_extrafanart = '//ul[@class="items_article_SampleImagesArea"]/li/a/@href'
     expr_tags = "//a[@class='tag tagTag']/text()"

+    def extraInit(self):
+        self.imagecut = 0
+
     def search(self, number):
         self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
         if self.specifiedUrl:


@@ -8,7 +8,6 @@ from .parser import Parser

 class Gcolle(Parser):
     source = 'gcolle'
-    imagecut = 4

     expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href'
     expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()'
@@ -25,8 +24,11 @@ class Gcolle(Parser):
     expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
     expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'

-    def search(self, number):
-        self.number = number.upper().replace('GCOLLE-','')
+    def extraInit(self):
+        self.imagecut = 4
+
+    def search(self, number: str):
+        self.number = number.upper().replace('GCOLLE-', '')
         if self.specifiedUrl:
             self.detailurl = self.specifiedUrl
         else:
@@ -69,5 +71,3 @@ class Gcolle(Parser):
         for i in range(len(extrafanart)):
             extrafanart[i] = 'https:' + extrafanart[i]
         return extrafanart


@@ -28,12 +28,6 @@ class Getchu():
         return dic


 class wwwGetchu(Parser):
-    imagecut = 0
-    allow_number_change = True
-    cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
-    GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='

     expr_title = '//*[@id="soft-title"]/text()'
     expr_cover = '//head/meta[@property="og:image"]/@content'
     expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
@@ -46,6 +40,13 @@ class wwwGetchu(Parser):
     expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"
     expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"

+    def extraInit(self):
+        self.imagecut = 0
+        self.allow_number_change = True
+        self.cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
+        self.GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
+
     def queryNumberUrl(self, number):
         if 'GETCHU' in number.upper():
             idn = re.findall('\d+',number)[0]
@@ -96,16 +97,6 @@ class dlGetchu(wwwGetchu):
     """ 二者基本一致
     headers extrafanart 略有区别
     """
-    imagecut = 4
-    allow_number_change = True
-    cookies = {"adult_check_flag": "1"}
-    extraheader = {"Referer": "https://dl.getchu.com/"}
-    GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
-    GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'

     expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
     expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
     expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
@@ -117,6 +108,16 @@ class dlGetchu(wwwGetchu):
     expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href"
     expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"

+    def extraInit(self):
+        self.imagecut = 4
+        self.allow_number_change = True
+        self.cookies = {"adult_check_flag": "1"}
+        self.extraheader = {"Referer": "https://dl.getchu.com/"}
+        self.GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
+        self.GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
+
     def queryNumberUrl(self, number):
         if "item" in number or 'GETCHU' in number.upper():
             self.number = re.findall('\d+',number)[0]


@@ -11,9 +11,6 @@ from .parser import Parser

 class Javdb(Parser):
     source = 'javdb'
-    fixstudio = False
-    noauth = False

     expr_number = '//strong[contains(text(),"番號")]/../span/text()'
     expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
     expr_title = "/html/head/title/text()"
@@ -44,6 +41,10 @@ class Javdb(Parser):
     expr_uservotes = '//span[@class="score-stars"]/../text()'
     expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'

+    def extraInit(self):
+        self.fixstudio = False
+        self.noauth = False
+
     def updateCore(self, core):
         if core.proxies:
             self.proxies = core.proxies
@@ -51,6 +52,8 @@ class Javdb(Parser):
             self.verify = core.verify
         if core.morestoryline:
             self.morestoryline = True
+        if core.specifiedSource == self.source:
+            self.specifiedUrl = core.specifiedUrl
         # special
         if core.dbcookies:
             self.cookies = core.dbcookies


@@ -8,8 +8,6 @@ from .parser import Parser

 class Javlibrary(Parser):
     source = 'javlibrary'
-    htmltree = None

     expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()'
     expr_title = '//div[@id="video_title"]/h3/a/text()'
     expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()'
@@ -22,6 +20,9 @@ class Javlibrary(Parser):
     expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()'
     expr_extrafanart = '//div[@class="previewthumbs"]/img/@src'

+    def extraInit(self):
+        self.htmltree = None
+
     def updateCore(self, core):
         if core.proxies:
             self.proxies = core.proxies
@@ -29,6 +30,8 @@ class Javlibrary(Parser):
             self.verify = core.verify
         if core.morestoryline:
             self.morestoryline = True
+        if core.specifiedSource == self.source:
+            self.specifiedUrl = core.specifiedUrl
         self.cookies = {'over18':'1'}

     def search(self, number):


@@ -8,14 +8,16 @@ from .parser import Parser

 class Madou(Parser):
     source = 'madou'
-    imagecut = 0
-    uncensored = True

     expr_url = '//a[@class="share-weixin"]/@data-url'
     expr_title = "/html/head/title/text()"
     expr_studio = '//a[@rel="category tag"]/text()'
     expr_tags = '/html/head/meta[@name="keywords"]/@content'

+    def extraInit(self):
+        self.imagecut = 0
+        self.uncensored = True
+
     def search(self, number):
         self.number = number.lower().strip()
         if self.specifiedUrl:


@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-from .utils import getTreeElement
 from .parser import Parser


@@ -8,8 +8,6 @@ from .parser import Parser

 class Mv91(Parser):
     source = 'mv91'
-    imagecut = 0
-    uncensored = True

     expr_number = '//div[@class="player-title"]/text()'
     expr_title = '//div[@class="player-title"]/text()'
@@ -18,6 +16,10 @@ class Mv91(Parser):
     expr_tags = '//div[@class="player-tag"]/text()'
     expr_actor = '//p[@class="player-name"]/text()'

+    def extraInit(self):
+        self.imagecut = 0
+        self.uncensored = True
+
     def getHtmlTree(self, url, type=None):
         self.htmlcode = self.getHtml(url, type)
         if self.htmlcode == 404:


@@ -11,23 +11,6 @@ class Parser:
     """ 基础刮削类
     """
     source = 'base'
-    # 推荐剪切poster封面:
-    # `0` 复制cover
-    # `1` 裁剪cover
-    # `3` 下载小封面
-    imagecut = 1
-    uncensored = False
-    allow_number_change = False
-    # update
-    proxies = None
-    verify = None
-    extraheader = None
-    cookies = None
-    morestoryline = False
-    specifiedUrl = None
-    number = ''
-    detailurl = ''

     # xpath expr
     expr_number = ''
     expr_title = ''
@@ -54,12 +37,33 @@ class Parser:
     expr_userrating = ''
     expr_uservotes = ''

-    def __init__(self) -> None:
+    def __init__(self):
+        # 推荐剪切poster封面:
+        # `0` 复制cover
+        # `1` 裁剪cover
+        # `3` 下载小封面
+        self.imagecut = 1
+        self.uncensored = False
+        self.allow_number_change = False
+        # update
+        self.proxies = None
+        self.verify = None
+        self.extraheader = None
+        self.cookies = None
+        self.morestoryline = False
+        self.specifiedUrl = None
+        self.extraInit()
+
+    def extraInit(self):
+        """ 自定义初始化内容
+        """
         pass

     def scrape(self, number, core: None):
         """ 刮削番号
         """
+        # 每次调用,初始化参数
+        self.__init__()
         self.updateCore(core)
         result = self.search(number)
         return result
@@ -228,17 +232,28 @@ class Parser:
     def getActorPhoto(self, htmltree) -> dict:
         return {}

-    def getUncensored(self, htmlree) -> bool:
-        if self.expr_uncensored:
-            u = self.getTreeAll(htmlree, self.expr_uncensored)
-            return bool(u)
-        else:
+    def getUncensored(self, htmltree) -> bool:
+        """
+        tag: 無码 無修正 uncensored 无码
+        title: 無碼 無修正 uncensored
+        """
+        if self.uncensored:
             return self.uncensored
+        tags = [x.lower() for x in self.getTags(htmltree) if len(x)]
+        title = self.getTitle(htmltree)
+        if self.expr_uncensored:
+            u = self.getTreeAll(htmltree, self.expr_uncensored)
+            self.uncensored = bool(u)
+        elif '無码' in tags or '無修正' in tags or 'uncensored' in tags or '无码' in tags:
+            self.uncensored = True
+        elif '無码' in title or '無修正' in title or 'uncensored' in title.lower():
+            self.uncensored = True
+        return self.uncensored

-    def getImagecut(self, htmlree):
+    def getImagecut(self, htmltree):
         """ 修正 无码poster不裁剪cover
         """
-        if self.imagecut == 1 and self.getUncensored(htmlree):
+        if self.imagecut == 1 and self.getUncensored(htmltree):
             self.imagecut = 0
         return self.imagecut
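
The parser.py hunks above carry the core change of this PR: mutable per-scrape defaults move from class attributes into __init__, subclasses set their own values by overriding the new extraInit() hook instead of shadowing class attributes, and scrape() calls self.__init__() so state left over from one scrape cannot leak into the next. A minimal sketch of the resulting pattern (the ExampleSite subclass and its values are illustrative, not taken from the repository):

# Sketch of the new initialization pattern; only the structure mirrors the patch,
# the ExampleSite subclass below is hypothetical.
class Parser:
    source = 'base'                 # per-class constants stay class attributes

    def __init__(self):
        # mutable per-scrape defaults now live on the instance
        self.imagecut = 1
        self.uncensored = False
        self.specifiedUrl = None
        self.extraInit()            # hook for subclass-specific defaults

    def extraInit(self):
        """Subclasses override this instead of shadowing class attributes."""
        pass

    def scrape(self, number, core=None):
        self.__init__()             # reset all per-scrape state on every call
        # the real class would now call updateCore(core) and search(number)
        return number


class ExampleSite(Parser):          # hypothetical subclass for illustration
    source = 'example'

    def extraInit(self):
        self.imagecut = 0
        self.uncensored = True


p = ExampleSite()
p.uncensored = False                # state changed while scraping one title...
p.scrape('ABC-123')
assert p.uncensored is True         # ...is reset before the next scrape

Re-running __init__() on every scrape() call restores the defaults and then re-applies the subclass overrides, so values mutated during one scrape (for example by getUncensored or getImagecut) do not carry over to the next.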


@@ -15,8 +15,8 @@ from urllib.parse import urljoin
 from lxml.html import fromstring
 from multiprocessing.dummy import Pool as ThreadPool
-from scrapinglib.airav import Airav
-from scrapinglib.xcity import Xcity
+from .airav import Airav
+from .xcity import Xcity
 from .httprequest import get_html_by_form, get_html_by_scraper, request_session

 # 舍弃 Amazon 源