Merge pull request #858 from Suwmlee/master

fixes
Yoshiko2 authored on 2022-08-25 23:28:51 +08:00, committed by GitHub
14 changed files with 112 additions and 72 deletions

View File

@@ -8,9 +8,6 @@ from .javbus import Javbus
class Airav(Parser):
source = 'airav'
# for javbus
specifiedSource = None
addtion_Javbus = True
expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()'
@@ -22,6 +19,11 @@ class Airav(Parser):
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
def extraInit(self):
# for javbus
self.specifiedSource = None
self.addtion_Javbus = True
def search(self, number):
self.number = number
if self.specifiedUrl:

View File

@@ -1,13 +1,10 @@
# -*- coding: utf-8 -*-
import re
from .parser import Parser
class Avsox(Parser):
source = 'avsox'
imagecut = 3
expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
expr_actor = '//a[@class="avatar-box"]'
@@ -21,7 +18,15 @@ class Avsox(Parser):
expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'
def queryNumberUrl(self, number):
def extraInit(self):
self.imagecut = 3
self.originalnum = ''
def queryNumberUrl(self, number: str):
upnum = number.upper()
if 'FC2' in upnum and 'FC2-PPV' not in upnum:
number = upnum.replace('FC2', 'FC2-PPV')
self.number = number
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
@@ -38,11 +43,14 @@ class Avsox(Parser):
new_number = self.getTreeElement(htmltree, self.expr_number)
if new_number.upper() != self.number.upper():
raise ValueError('number not found in ' + self.source)
self.originalnum = new_number
if 'FC2-PPV' in new_number.upper():
new_number = new_number.upper().replace('FC2-PPV', 'FC2')
self.number = new_number
return new_number
return self.number
def getTitle(self, htmltree):
return super().getTitle(htmltree).replace('/', '').strip(self.number)
return super().getTitle(htmltree).replace('/', '').strip(self.originalnum).strip()
def getStudio(self, htmltree):
return super().getStudio(htmltree).replace("', '", ' ')
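
The Avsox change separates the site's own numbering from the caller's: `queryNumberUrl` first resolves the current Avsox mirror via tellme.pw, widens a plain 'FC2' number to the site's 'FC2-PPV' form for the search, and the number check later records the site's spelling in `originalnum` while converting the stored number back, so `getTitle` can strip the site's spelling out of the title. A minimal standalone sketch of that round-trip, with hypothetical helper names (the real logic lives inside the class methods shown above):

def to_query_number(number: str) -> str:
    # Avsox lists FC2 videos under 'FC2-PPV', so widen the query form
    upnum = number.upper()
    if 'FC2' in upnum and 'FC2-PPV' not in upnum:
        return upnum.replace('FC2', 'FC2-PPV')
    return number

def to_stored_number(site_number: str) -> str:
    # collapse the site's 'FC2-PPV' spelling back to the local 'FC2' form
    upnum = site_number.upper()
    if 'FC2-PPV' in upnum:
        return upnum.replace('FC2-PPV', 'FC2')
    return site_number

# illustrative id: FC2-1234567 -> FC2-PPV-1234567 (query) -> FC2-1234567 (stored)
assert to_query_number('FC2-1234567') == 'FC2-PPV-1234567'
assert to_stored_number('FC2-PPV-1234567') == 'FC2-1234567'

Note that `str.strip(self.originalnum)` trims any of those characters from both ends rather than an exact substring, which is presumably why the trailing `.strip()` was added to tidy leftover whitespace.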

View File

@@ -8,7 +8,6 @@ from .parser import Parser
class Carib(Parser):
source = 'carib'
uncensored = True
expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
expr_release = "//li[2]/span[@class='spec-content']/text()"
@@ -20,6 +19,9 @@ class Carib(Parser):
expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"
def extraInit(self):
self.uncensored = True
def search(self, number):
self.number = number
if self.specifiedUrl:

View File

@@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Dlsite(Parser):
source = 'dlsite'
imagecut = 4
allow_number_change = True
expr_title = '/html/head/title/text()'
expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
@@ -27,6 +24,10 @@ class Dlsite(Parser):
expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'
def extraInit(self):
self.imagecut = 4
self.allow_number_change = True
def search(self, number):
self.cookies = {'locale': 'zh-cn'}
if self.specifiedUrl:

View File

@@ -9,7 +9,6 @@ from .parser import Parser
class Fc2(Parser):
source = 'fc2'
imagecut = 0
expr_title = '/html/head/title/text()'
expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
@@ -21,6 +20,9 @@ class Fc2(Parser):
expr_extrafanart = '//ul[@class="items_article_SampleImagesArea"]/li/a/@href'
expr_tags = "//a[@class='tag tagTag']/text()"
def extraInit(self):
self.imagecut = 0
def search(self, number):
self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
if self.specifiedUrl:

View File

@@ -8,7 +8,6 @@ from .parser import Parser
class Gcolle(Parser):
source = 'gcolle'
imagecut = 4
expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href'
expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()'
@@ -25,8 +24,11 @@ class Gcolle(Parser):
expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
def search(self, number):
self.number = number.upper().replace('GCOLLE-','')
def extraInit(self):
self.imagecut = 4
def search(self, number: str):
self.number = number.upper().replace('GCOLLE-', '')
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
@@ -69,5 +71,3 @@ class Gcolle(Parser):
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
return extrafanart
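
Gcolle, like the Fc2 parser above, accepts a user-facing number that may carry a source prefix and strips it before searching. A tiny sketch of both normalizations (the function names and example ids are illustrative only):

def normalize_fc2(number: str) -> str:
    # 'FC2-PPV-1234567' and 'fc2-1234567' both become '1234567'
    return number.lower().replace('fc2-ppv-', '').replace('fc2-', '')

def normalize_gcolle(number: str) -> str:
    # 'gcolle-840724' becomes '840724'
    return number.upper().replace('GCOLLE-', '')

assert normalize_fc2('FC2-PPV-1234567') == '1234567'
assert normalize_gcolle('gcolle-840724') == '840724'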

View File

@@ -28,12 +28,6 @@ class Getchu():
return dic
class wwwGetchu(Parser):
imagecut = 0
allow_number_change = True
cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
expr_title = '//*[@id="soft-title"]/text()'
expr_cover = '//head/meta[@property="og:image"]/@content'
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
@@ -46,6 +40,13 @@ class wwwGetchu(Parser):
expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"
expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
def extraInit(self):
self.imagecut = 0
self.allow_number_change = True
self.cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
self.GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
def queryNumberUrl(self, number):
if 'GETCHU' in number.upper():
idn = re.findall('\d+',number)[0]
@@ -96,16 +97,6 @@ class dlGetchu(wwwGetchu):
""" 二者基本一致
headers extrafanart 略有区别
"""
imagecut = 4
allow_number_change = True
cookies = {"adult_check_flag": "1"}
extraheader = {"Referer": "https://dl.getchu.com/"}
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
@@ -117,6 +108,16 @@ class dlGetchu(wwwGetchu):
expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href"
expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
def extraInit(self):
self.imagecut = 4
self.allow_number_change = True
self.cookies = {"adult_check_flag": "1"}
self.extraheader = {"Referer": "https://dl.getchu.com/"}
self.GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
self.GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
def queryNumberUrl(self, number):
if "item" in number or 'GETCHU' in number.upper():
self.number = re.findall('\d+',number)[0]
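
Both Getchu parsers now set their cookies and search-URL templates in `extraInit()`; the `_WORD_` token marks where the keyword or numeric item id is substituted. The substitution itself is outside this hunk, so the sketch below assumes a plain `str.replace`, which is the usual way such a placeholder is filled; the example id is made up:

import re

GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'

def build_dl_getchu_url(number: str) -> str:
    # pull the numeric item id out of inputs like 'GETCHU-1234567' or 'item1234567'
    item_id = re.findall(r'\d+', number)[0]
    # assumption: the template's _WORD_ placeholder is filled with str.replace
    return GETCHU_DL_URL.replace('_WORD_', item_id)

assert build_dl_getchu_url('GETCHU-1234567') == 'https://dl.getchu.com/i/item1234567'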

View File

@@ -11,9 +11,6 @@ from .parser import Parser
class Javdb(Parser):
source = 'javdb'
fixstudio = False
noauth = False
expr_number = '//strong[contains(text(),"番號")]/../span/text()'
expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
expr_title = "/html/head/title/text()"
@@ -44,6 +41,10 @@ class Javdb(Parser):
expr_uservotes = '//span[@class="score-stars"]/../text()'
expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'
def extraInit(self):
self.fixstudio = False
self.noauth = False
def updateCore(self, core):
if core.proxies:
self.proxies = core.proxies
@@ -51,6 +52,8 @@ class Javdb(Parser):
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
if core.specifiedSource == self.source:
self.specifiedUrl = core.specifiedUrl
# special
if core.dbcookies:
self.cookies = core.dbcookies
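
Javdb and Javlibrary both gain the same guard in `updateCore`: a caller-supplied `specifiedUrl` is only honoured when `core.specifiedSource` names this parser, so a URL pinned for one site cannot leak into another source. A condensed sketch of the pattern (the `core` object and the Javdb-only `dbcookies` handling are simplified here):

class SourceParser:
    source = 'javdb'   # each subclass declares which site it scrapes

    def __init__(self):
        self.proxies = None
        self.verify = None
        self.morestoryline = False
        self.specifiedUrl = None

    def updateCore(self, core):
        if core is None:
            return
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
            self.verify = core.verify
        if core.morestoryline:
            self.morestoryline = True
        # only accept a hand-picked URL that was aimed at this source
        if core.specifiedSource == self.source:
            self.specifiedUrl = core.specifiedUrl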

View File

@@ -8,8 +8,6 @@ from .parser import Parser
class Javlibrary(Parser):
source = 'javlibrary'
htmltree = None
expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()'
expr_title = '//div[@id="video_title"]/h3/a/text()'
expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()'
@@ -22,6 +20,9 @@ class Javlibrary(Parser):
expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()'
expr_extrafanart = '//div[@class="previewthumbs"]/img/@src'
def extraInit(self):
self.htmltree = None
def updateCore(self, core):
if core.proxies:
self.proxies = core.proxies
@@ -29,6 +30,8 @@ class Javlibrary(Parser):
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
if core.specifiedSource == self.source:
self.specifiedUrl = core.specifiedUrl
self.cookies = {'over18':'1'}
def search(self, number):

View File

@@ -8,14 +8,16 @@ from .parser import Parser
class Madou(Parser):
source = 'madou'
imagecut = 0
uncensored = True
expr_url = '//a[@class="share-weixin"]/@data-url'
expr_title = "/html/head/title/text()"
expr_studio = '//a[@rel="category tag"]/text()'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
def extraInit(self):
self.imagecut = 0
self.uncensored = True
def search(self, number):
self.number = number.lower().strip()
if self.specifiedUrl:

View File

@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
from .utils import getTreeElement
from .parser import Parser

View File

@@ -8,8 +8,6 @@ from .parser import Parser
class Mv91(Parser):
source = 'mv91'
imagecut = 0
uncensored = True
expr_number = '//div[@class="player-title"]/text()'
expr_title = '//div[@class="player-title"]/text()'
@@ -18,6 +16,10 @@ class Mv91(Parser):
expr_tags = '//div[@class="player-tag"]/text()'
expr_actor = '//p[@class="player-name"]/text()'
def extraInit(self):
self.imagecut = 0
self.uncensored = True
def getHtmlTree(self, url, type=None):
self.htmlcode = self.getHtml(url, type)
if self.htmlcode == 404:

View File

@@ -11,23 +11,6 @@ class Parser:
""" 基础刮削类
"""
source = 'base'
# Recommended way to cut the poster from the cover:
# `0` copy the cover
# `1` crop the cover
# `3` download the small cover
imagecut = 1
uncensored = False
allow_number_change = False
# update
proxies = None
verify = None
extraheader = None
cookies = None
morestoryline = False
specifiedUrl = None
number = ''
detailurl = ''
# xpath expr
expr_number = ''
expr_title = ''
@@ -54,12 +37,33 @@ class Parser:
expr_userrating = ''
expr_uservotes = ''
def __init__(self) -> None:
def __init__(self):
# Recommended way to cut the poster from the cover:
# `0` copy the cover
# `1` crop the cover
# `3` download the small cover
self.imagecut = 1
self.uncensored = False
self.allow_number_change = False
# update
self.proxies = None
self.verify = None
self.extraheader = None
self.cookies = None
self.morestoryline = False
self.specifiedUrl = None
self.extraInit()
def extraInit(self):
""" 自定义初始化内容
"""
pass
def scrape(self, number, core: None):
""" 刮削番号
"""
# re-initialize parameters on every call
self.__init__()
self.updateCore(core)
result = self.search(number)
return result
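
This is the heart of the pull request: everything that used to sit on the class body (and was therefore shared by all lookups and easy to leave stale) is now set in `__init__`, per-source defaults move into the `extraInit()` hook, and `scrape()` re-runs `__init__` on every call so each lookup starts from a clean slate. A stripped-down sketch of the new lifecycle; the `search` body is a placeholder and only a handful of the real attributes are shown:

class Parser:
    source = 'base'
    expr_title = ''           # XPath expressions stay as class-level constants

    def __init__(self):
        # mutable, per-scrape state now lives on the instance
        self.imagecut = 1
        self.uncensored = False
        self.cookies = None
        self.specifiedUrl = None
        self.extraInit()      # let subclasses layer their own defaults on top

    def extraInit(self):
        """Hook for source-specific defaults."""
        pass

    def updateCore(self, core):
        pass

    def scrape(self, number, core=None):
        self.__init__()       # reset state so one lookup cannot taint the next
        self.updateCore(core)
        return self.search(number)

    def search(self, number):
        return {'number': number}     # placeholder for the real site lookup


class Carib(Parser):
    source = 'carib'

    def extraInit(self):
        self.uncensored = True        # Carib releases are uncensored by default

Without the reset, a flag such as `imagecut`, which `getImagecut` lowers to 0 for uncensored titles, would carry over to the next number scraped by the same instance.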
@@ -228,17 +232,28 @@ class Parser:
def getActorPhoto(self, htmltree) -> dict:
return {}
def getUncensored(self, htmlree) -> bool:
if self.expr_uncensored:
u = self.getTreeAll(htmlree, self.expr_uncensored)
return bool(u)
else:
def getUncensored(self, htmltree) -> bool:
"""
uncensored markers in tags: 無码 無修正 uncensored 无码
uncensored markers in title: 無碼 無修正 uncensored
"""
if self.uncensored:
return self.uncensored
tags = [x.lower() for x in self.getTags(htmltree) if len(x)]
title = self.getTitle(htmltree)
if self.expr_uncensored:
u = self.getTreeAll(htmltree, self.expr_uncensored)
self.uncensored = bool(u)
elif '無码' in tags or '無修正' in tags or 'uncensored' in tags or '无码' in tags:
self.uncensored = True
elif '無码' in title or '無修正' in title or 'uncensored' in title.lower():
self.uncensored = True
return self.uncensored
def getImagecut(self, htmlree):
def getImagecut(self, htmltree):
""" 修正 无码poster不裁剪cover
"""
if self.imagecut == 1 and self.getUncensored(htmlree):
if self.imagecut == 1 and self.getUncensored(htmltree):
self.imagecut = 0
return self.imagecut
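
The rewritten `getUncensored` keeps the explicit `expr_uncensored` XPath as the primary signal but now falls back to keyword matching on the tags and the title, caching the answer in `self.uncensored` so `getImagecut` can reuse it. A self-contained sketch of just the fallback, operating on plain lists and strings instead of an lxml tree:

def looks_uncensored(tags, title):
    # mirrors the tag/title fallback in Parser.getUncensored
    tags = [t.lower() for t in tags if len(t)]
    if '無码' in tags or '無修正' in tags or 'uncensored' in tags or '无码' in tags:
        return True
    return '無码' in title or '無修正' in title or 'uncensored' in title.lower()

def fix_imagecut(imagecut, tags, title):
    # mirrors Parser.getImagecut: crop hint 1 becomes 0 (copy cover) when uncensored
    if imagecut == 1 and looks_uncensored(tags, title):
        return 0
    return imagecut

assert fix_imagecut(1, ['無修正'], 'some title') == 0
assert fix_imagecut(1, [], 'ordinary title') == 1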

View File

@@ -15,8 +15,8 @@ from urllib.parse import urljoin
from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool
from scrapinglib.airav import Airav
from scrapinglib.xcity import Xcity
from .airav import Airav
from .xcity import Xcity
from .httprequest import get_html_by_form, get_html_by_scraper, request_session
# the Amazon source has been dropped