update scrapinglib
- support specifiedUrl when scraping a single movie
- support javlibrary and rating
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
from .api import search
|
from .api import search, getSupportedSources
|
||||||
|
|||||||
@@ -8,6 +8,9 @@ from .javbus import Javbus
|
|||||||
|
|
||||||
class Airav(Parser):
|
class Airav(Parser):
|
||||||
source = 'airav'
|
source = 'airav'
|
||||||
|
# for javbus
|
||||||
|
specifiedSource = None
|
||||||
|
addtion_Javbus = True
|
||||||
|
|
||||||
expr_title = '/html/head/title/text()'
|
expr_title = '/html/head/title/text()'
|
||||||
expr_number = '/html/head/title/text()'
|
expr_number = '/html/head/title/text()'
|
||||||
@@ -21,23 +24,38 @@ class Airav(Parser):
|
|||||||
|
|
||||||
def search(self, number):
    """Scrape the airav detail page for ``number``.

    The detail URL is ``self.specifiedUrl`` when one was supplied, otherwise
    it is resolved through ``queryNumberUrl``. When ``addtion_Javbus`` is
    enabled, the same number is also scraped through ``Javbus`` and the
    decoded result is kept in ``self.javbus`` for the getters to consult.

    :param number: movie number to look up
    :return: the value of ``self.dictformat`` for the parsed page, or ``404``
        when the detail page request reports 404
    """
    self.number = number
    # An explicitly supplied URL takes precedence over the site search.
    if self.specifiedUrl:
        self.detailurl = self.specifiedUrl
    else:
        self.detailurl = self.queryNumberUrl(self.number)
    if self.addtion_Javbus:
        # self.number is used (not the argument) because queryNumberUrl may
        # have replaced it with the spelling shown on the site.
        javbusinfo = Javbus().scrape(self.number, self)
        if javbusinfo == 404:
            self.javbus = {"title": ""}
        else:
            self.javbus = json.loads(javbusinfo)
    self.htmlcode = self.getHtml(self.detailurl)
    # Same 404 convention the other parsers in this package follow; without
    # this guard the integer would be handed to the HTML parser below.
    if self.htmlcode == 404:
        return 404
    htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
    return self.dictformat(htmltree)
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
    """Resolve the airav detail page URL for ``number`` via the site search.

    Each search result's displayed number is compared with ``number``
    (hyphens ignored, the query upper-cased). On a match ``self.number`` is
    replaced with the site's spelling and that result's link is returned.

    :param number: movie number typed by the caller
    :return: absolute detail page URL; falls back to
        ``https://cn.airav.wiki/video/<number>`` when nothing matches
    """
    queryUrl = "https://cn.airav.wiki/?search=" + number
    queryTree = self.getHtmlTree(queryUrl)
    results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a')
    wanted = number.replace('-', '').upper()
    for anchor in results:
        # The expression must be relative (leading '.'): with lxml an
        # expression starting with '//' is evaluated from the document root,
        # so every result would report the first number on the page.
        # NOTE(review): assumes getTreeElement evaluates the expression with
        # xpath() on the node it is given - confirm against Parser.
        num = self.getTreeElement(anchor, './/div/div[contains(@class,"videoNumber")]/p[1]/text()')
        if not isinstance(num, str):
            # Lookup produced no text for this result; nothing to compare.
            continue
        if num.replace('-', '') == wanted:
            self.number = num
            return "https://cn.airav.wiki" + anchor.attrib['href']
    return 'https://cn.airav.wiki/video/' + number
|
||||||
|
|
||||||
def getNum(self, htmltree):
    """Return the movie number.

    When ``addtion_Javbus`` is enabled and the javbus data carries a
    non-empty string under ``'number'``, that value is returned. Otherwise
    the parent ``getNum`` value is used and the text inside its leading
    ``[...]`` is extracted.

    :raises IndexError: when the parent value has no leading ``[...]`` part
    """
    if self.addtion_Javbus:
        javbus_number = self.javbus.get('number')
        if isinstance(javbus_number, str) and javbus_number:
            return javbus_number
    raw_number = super().getNum(htmltree)
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    return str(re.findall(r'^\[(.*?)]', raw_number)[0])
|
||||||
@@ -48,24 +66,27 @@ class Airav(Parser):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def getStudio(self, htmltree):
    """Return the studio, preferring the javbus data when it is enabled.

    Falls back to the parent ``getStudio`` when ``addtion_Javbus`` is off or
    the javbus ``'studio'`` entry is not a non-empty string.
    """
    studio = self.javbus.get('studio') if self.addtion_Javbus else None
    if isinstance(studio, str) and studio:
        return studio
    return super().getStudio(htmltree)
|
||||||
|
|
||||||
def getRelease(self, htmltree):
    """Return the release date.

    A non-empty javbus ``'release'`` string wins when ``addtion_Javbus`` is
    enabled. Otherwise the first ``YYYY-MM-DD`` pattern found in the parent
    ``getRelease`` value is returned, or ``''`` when that lookup fails.
    """
    if self.addtion_Javbus:
        release = self.javbus.get('release')
        if isinstance(release, str) and release:
            return release
    try:
        return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
    except Exception:
        # Best effort: no date on the page (re.search returned None) or the
        # parent lookup failed. Unlike a bare except, this lets
        # KeyboardInterrupt/SystemExit propagate.
        return ''
|
||||||
|
|
||||||
def getYear(self, htmltree):
    """Return the release year as a string.

    A non-empty javbus ``'year'`` string wins when ``addtion_Javbus`` is
    enabled; otherwise the four-digit group found in ``getRelease`` is used
    (``''`` when the release string has none).
    """
    if self.addtion_Javbus:
        year = self.javbus.get('year')
        if isinstance(year, str) and year:
            return year
    release = self.getRelease(htmltree)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # The list repr is stripped of its brackets/quotes, as before.
    return str(re.findall(r'\d{4}', release)).strip(" ['']")
|
||||||
|
|
||||||
@@ -73,39 +94,40 @@ class Airav(Parser):
|
|||||||
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
|
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
|
||||||
|
|
||||||
def getRuntime(self, htmltree):
    """Return the javbus ``'runtime'`` string, or ``''`` when unavailable.

    The airav page itself is not consulted; ``htmltree`` is unused.
    """
    if not self.addtion_Javbus:
        return ''
    runtime = self.javbus.get('runtime')
    if isinstance(runtime, str) and runtime:
        return runtime
    return ''
|
||||||
|
|
||||||
def getDirector(self, htmltree):
    """Return the javbus ``'director'`` string, or ``''`` when unavailable.

    The airav page itself is not consulted; ``htmltree`` is unused.
    """
    if not self.addtion_Javbus:
        return ''
    director = self.javbus.get('director')
    if isinstance(director, str) and director:
        return director
    return ''
|
||||||
|
|
||||||
def getActors(self, htmltree):
    """Return the actor names as a list.

    Names from the parent ``getActors`` are preferred. When none survive
    stripping and ``addtion_Javbus`` is enabled, a non-empty javbus
    ``'actor'`` list is returned instead; otherwise ``[]``.
    """
    # Strip first, then drop empties: filtering on the unstripped length
    # would let whitespace-only names through as '' entries.
    stripped = [name.strip() for name in super().getActors(htmltree)]
    actors = [name for name in stripped if name]
    if actors:
        return actors
    if self.addtion_Javbus:
        fallback = self.javbus.get('actor')
        if isinstance(fallback, list) and fallback:
            return fallback
    return []
|
||||||
|
|
||||||
def getCover(self, htmltree):
    """Return the cover URL, preferring the javbus data when it is enabled.

    Falls back to the parent ``getCover`` when ``addtion_Javbus`` is off or
    the javbus ``'cover'`` entry is not a non-empty string.
    """
    cover = self.javbus.get('cover') if self.addtion_Javbus else None
    if isinstance(cover, str) and cover:
        return cover
    return super().getCover(htmltree)
|
||||||
|
|
||||||
def getSeries(self, htmltree):
    """Return the javbus ``'series'`` string, or ``''`` when unavailable.

    The airav page itself is not consulted; ``htmltree`` is unused.
    """
    if not self.addtion_Javbus:
        return ''
    series = self.javbus.get('series')
    if isinstance(series, str) and series:
        return series
    return ''
|
||||||
|
|||||||
@@ -18,29 +18,45 @@ from .mgstage import Mgstage
|
|||||||
from .javbus import Javbus
|
from .javbus import Javbus
|
||||||
from .xcity import Xcity
|
from .xcity import Xcity
|
||||||
from .avsox import Avsox
|
from .avsox import Avsox
|
||||||
|
from .javlibrary import Javlibrary
|
||||||
|
|
||||||
from .tmdb import Tmdb
|
from .tmdb import Tmdb
|
||||||
|
from .imdb import Imdb
|
||||||
|
|
||||||
|
|
||||||
def search(number, sources: str=None, proxies=None, verify=None, type='adult',
           specifiedSource=None, specifiedUrl=None,
           dbcookies=None, dbsite=None, morestoryline=False):
    """Search for metadata by movie number or movie name.

    All keyword options are forwarded unchanged to ``Scraping.search`` on a
    fresh ``Scraping`` instance.

    :param number: number/name depends on type
    :param sources: sources string with `,` Eg: `avsox,javbus`
    :param type: `adult`, `general`
    """
    options = dict(
        proxies=proxies, verify=verify, type=type,
        specifiedSource=specifiedSource, specifiedUrl=specifiedUrl,
        dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline,
    )
    return Scraping().search(number, sources, **options)
|
||||||
|
|
||||||
|
|
||||||
|
def getSupportedSources(tag='adult'):
    """Return the supported source names joined with ``,``.

    :param tag: `adult`, `general` (any value other than `adult` selects
        the general list)
    """
    scraper = Scraping()
    names = scraper.adult_full_sources if tag == 'adult' else scraper.general_full_sources
    return ','.join(names)
|
||||||
|
|
||||||
|
|
||||||
class Scraping():
|
class Scraping():
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
|
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
|
||||||
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
|
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
|
||||||
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
|
'getchu', 'gcolle'
|
||||||
'gcolle', 'javdb', 'getchu']
|
]
|
||||||
adult_func_mapping = {
|
adult_func_mapping = {
|
||||||
'avsox': Avsox().scrape,
|
'avsox': Avsox().scrape,
|
||||||
'javbus': Javbus().scrape,
|
'javbus': Javbus().scrape,
|
||||||
@@ -57,15 +73,19 @@ class Scraping():
|
|||||||
'gcolle': Gcolle().scrape,
|
'gcolle': Gcolle().scrape,
|
||||||
'javdb': Javdb().scrape,
|
'javdb': Javdb().scrape,
|
||||||
'getchu': Getchu().scrape,
|
'getchu': Getchu().scrape,
|
||||||
|
'javlibrary': Javlibrary().scrape,
|
||||||
}
|
}
|
||||||
|
|
||||||
general_full_sources = ['tmdb']
|
general_full_sources = ['tmdb','imdb']
|
||||||
general_func_mapping = {
|
general_func_mapping = {
|
||||||
'tmdb': Tmdb().scrape,
|
'tmdb': Tmdb().scrape,
|
||||||
|
'imdb': Imdb().scrape,
|
||||||
}
|
}
|
||||||
|
|
||||||
proxies = None
|
proxies = None
|
||||||
verify = None
|
verify = None
|
||||||
|
specifiedSource = None
|
||||||
|
specifiedUrl = None
|
||||||
|
|
||||||
dbcookies = None
|
dbcookies = None
|
||||||
dbsite = None
|
dbsite = None
|
||||||
@@ -73,9 +93,12 @@ class Scraping():
|
|||||||
morestoryline = False
|
morestoryline = False
|
||||||
|
|
||||||
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
|
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
|
||||||
|
specifiedSource=None, specifiedUrl=None,
|
||||||
dbcookies=None, dbsite=None, morestoryline=False):
|
dbcookies=None, dbsite=None, morestoryline=False):
|
||||||
self.proxies = proxies
|
self.proxies = proxies
|
||||||
self.verify = verify
|
self.verify = verify
|
||||||
|
self.specifiedSource = specifiedSource
|
||||||
|
self.specifiedUrl = specifiedUrl
|
||||||
self.dbcookies = dbcookies
|
self.dbcookies = dbcookies
|
||||||
self.dbsite = dbsite
|
self.dbsite = dbsite
|
||||||
self.morestoryline = morestoryline
|
self.morestoryline = morestoryline
|
||||||
@@ -88,7 +111,10 @@ class Scraping():
|
|||||||
""" 查询电影电视剧
|
""" 查询电影电视剧
|
||||||
imdb,tmdb
|
imdb,tmdb
|
||||||
"""
|
"""
|
||||||
sources = self.checkGeneralSources(sources, name)
|
if self.specifiedSource:
|
||||||
|
sources = [self.specifiedSource]
|
||||||
|
else:
|
||||||
|
sources = self.checkGeneralSources(sources, name)
|
||||||
json_data = {}
|
json_data = {}
|
||||||
for source in sources:
|
for source in sources:
|
||||||
try:
|
try:
|
||||||
@@ -116,7 +142,10 @@ class Scraping():
|
|||||||
return json_data
|
return json_data
|
||||||
|
|
||||||
def searchAdult(self, number, sources):
|
def searchAdult(self, number, sources):
|
||||||
sources = self.checkAdultSources(sources, number)
|
if self.specifiedSource:
|
||||||
|
sources = [self.specifiedSource]
|
||||||
|
else:
|
||||||
|
sources = self.checkAdultSources(sources, number)
|
||||||
json_data = {}
|
json_data = {}
|
||||||
for source in sources:
|
for source in sources:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -50,10 +50,14 @@ class Avsox(Parser):
|
|||||||
def getSmallCover(self, htmltree):
    """Return the preview thumbnail taken from the search page.

    The lookup runs against ``self.searchtree`` rather than ``htmltree``.
    When it fails, ``self.imagecut`` is set to 1 and ``''`` is returned.
    """
    try:
        return self.getTreeElement(self.searchtree, self.expr_smallcover)
    except Exception:
        # Best effort: no usable search page thumbnail. Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        self.imagecut = 1
        return ''
|
||||||
|
|
||||||
def getTags(self, htmltree):
    """Return the tag list parsed from the ``expr_tags`` lookup.

    The looked-up text is split on ``,``; the first two fields are dropped
    and the remaining ones are returned stripped. ``[]`` when there are two
    fields or fewer.
    """
    fields = self.getTreeElement(htmltree, self.expr_tags).split(',')
    if len(fields) <= 2:
        return []
    return [field.strip() for field in fields[2:]]
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
def getOutline(self, htmltree):
|
||||||
|
|||||||
@@ -22,7 +22,10 @@ class Carib(Parser):
|
|||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number
|
self.number = number
|
||||||
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
else:
|
||||||
|
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
|
||||||
htmlcode = self.getHtml(self.detailurl)
|
htmlcode = self.getHtml(self.detailurl)
|
||||||
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
|
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
|
||||||
return 404
|
return 404
|
||||||
|
|||||||
@@ -29,7 +29,12 @@ class Dlsite(Parser):
|
|||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.cookies = {'locale': 'zh-cn'}
|
self.cookies = {'locale': 'zh-cn'}
|
||||||
if "RJ" in number or "VJ" in number:
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
# TODO 应该从页面内获取 number
|
||||||
|
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
elif "RJ" in number or "VJ" in number:
|
||||||
self.number = number.upper()
|
self.number = number.upper()
|
||||||
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
|
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
|
||||||
htmltree = self.getHtmlTree(self.detailurl)
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
|||||||
@@ -11,15 +11,21 @@ class Fanza(Parser):
|
|||||||
|
|
||||||
expr_title = '//*[starts-with(@id, "title")]/text()'
|
expr_title = '//*[starts-with(@id, "title")]/text()'
|
||||||
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||||
expr_cover = '//head/meta[@property="og:image"]'
|
expr_cover = './/head/meta[@property="og:image"]/@content'
|
||||||
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
||||||
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
||||||
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
||||||
expr_outline_og = '//head/meta[@property="og:description"]'
|
expr_outline_og = '//head/meta[@property="og:description"]/@content'
|
||||||
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number
|
self.number = number
|
||||||
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
durl = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
|
||||||
|
self.htmltree = self.getHtmlTree(durl)
|
||||||
|
result = self.dictformat(self.htmltree)
|
||||||
|
return result
|
||||||
# fanza allow letter + number + underscore, normalize the input here
|
# fanza allow letter + number + underscore, normalize the input here
|
||||||
# @note: I only find the usage of underscore as h_test123456789
|
# @note: I only find the usage of underscore as h_test123456789
|
||||||
fanza_search_number = number
|
fanza_search_number = number
|
||||||
@@ -75,7 +81,7 @@ class Fanza(Parser):
|
|||||||
if result == '':
|
if result == '':
|
||||||
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
||||||
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
|
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
|
||||||
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
|
result = self.getTreeElement(htmltree, self.expr_outline_og)
|
||||||
return result
|
return result
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
@@ -99,9 +105,6 @@ class Fanza(Parser):
|
|||||||
result = self.getFanzaString('配信開始日:')
|
result = self.getFanzaString('配信開始日:')
|
||||||
return result.replace("/", "-").strip('\\n')
|
return result.replace("/", "-").strip('\\n')
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
|
||||||
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
|
|
||||||
|
|
||||||
def getTags(self, htmltree):
    """Return the values ``getFanzaStrings`` yields for the genre label.

    ``htmltree`` is unused; the lookup is delegated entirely to
    ``getFanzaStrings``.
    """
    genres = self.getFanzaStrings('ジャンル:')
    return genres
|
||||||
|
|
||||||
|
|||||||
@@ -22,8 +22,11 @@ class Fc2(Parser):
|
|||||||
expr_tags = "//a[@class='tag tagTag']/text()"
|
expr_tags = "//a[@class='tag tagTag']/text()"
|
||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number.replace('FC2-', '').replace('fc2-', '')
|
self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
|
||||||
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
else:
|
||||||
|
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
|
||||||
self.htmlcode = self.getHtml(self.detailurl)
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
if self.htmlcode == 404:
|
if self.htmlcode == 404:
|
||||||
return 404
|
return 404
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from .httprequest import get_html_session
|
from .httprequest import request_session
|
||||||
from .parser import Parser
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
@@ -27,9 +27,12 @@ class Gcolle(Parser):
|
|||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number.upper().replace('GCOLLE-','')
|
self.number = number.upper().replace('GCOLLE-','')
|
||||||
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
|
if self.specifiedUrl:
|
||||||
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
self.detailurl = self.specifiedUrl
|
||||||
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + self.number).text
|
else:
|
||||||
|
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
|
||||||
|
session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
|
htmlcode = session.get(self.detailurl).text
|
||||||
htmltree = etree.HTML(htmlcode)
|
htmltree = etree.HTML(htmlcode)
|
||||||
|
|
||||||
r18url = self.getTreeElement(htmltree, self.expr_r18)
|
r18url = self.getTreeElement(htmltree, self.expr_r18)
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class wwwGetchu(Parser):
|
|||||||
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
|
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
|
||||||
|
|
||||||
expr_title = '//*[@id="soft-title"]/text()'
|
expr_title = '//*[@id="soft-title"]/text()'
|
||||||
expr_cover = '//head/meta[@property="og:image"]'
|
expr_cover = '//head/meta[@property="og:image"]/@content'
|
||||||
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
@@ -67,9 +67,6 @@ class wwwGetchu(Parser):
|
|||||||
def getNum(self, htmltree):
    """Return ``'GETCHU-'`` plus the first run of digits in the detail URL.

    The ``http://www.getchu.com/soft.phtml?id=`` prefix is removed from
    ``self.detailurl`` before the digits are searched; ``htmltree`` is unused.

    :raises IndexError: when the remaining URL text contains no digits
    """
    id_part = self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", "")
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return 'GETCHU-' + re.findall(r'\d+', id_part)[0]
|
||||||
|
|
||||||
def getCover(self, htmltree):
|
|
||||||
return self.getTreeElement(htmltree, self.expr_cover).get('content')
|
|
||||||
|
|
||||||
def getActors(self, htmltree):
    """Return the parent ``getDirector`` value as the actor value.

    No separate cast lookup is performed; the director lookup is reused.
    """
    director = super().getDirector(htmltree)
    return director
|
||||||
|
|
||||||
|
|||||||
@@ -9,8 +9,9 @@ from cloudscraper import create_scraper
|
|||||||
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
|
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
|
||||||
G_DEFAULT_TIMEOUT = 10
|
G_DEFAULT_TIMEOUT = 10
|
||||||
|
|
||||||
def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
|
|
||||||
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None,
|
||||||
|
retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
"""
|
"""
|
||||||
网页请求核心函数
|
网页请求核心函数
|
||||||
|
|
||||||
@@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
|
|||||||
raise Exception('Connect Failed')
|
raise Exception('Connect Failed')
|
||||||
|
|
||||||
|
|
||||||
def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
|
def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
|
||||||
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
"""
|
"""
|
||||||
是否使用代理应由上层处理
|
是否使用代理应由上层处理
|
||||||
"""
|
"""
|
||||||
@@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_
|
|||||||
raise Exception('Connect Failed')
|
raise Exception('Connect Failed')
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# TODO: 以下临时使用,更新完各站后,再更新
|
|
||||||
#
|
|
||||||
|
|
||||||
|
|
||||||
class TimeoutHTTPAdapter(HTTPAdapter):
|
class TimeoutHTTPAdapter(HTTPAdapter):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.timeout = G_DEFAULT_TIMEOUT
|
self.timeout = G_DEFAULT_TIMEOUT
|
||||||
@@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter):
|
|||||||
return super().send(request, **kwargs)
|
return super().send(request, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# with keep-alive feature
|
def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
# storyline carib gcolle javdb only
|
"""
|
||||||
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
|
keep-alive
|
||||||
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
"""
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
||||||
status_forcelist=[429, 500, 502, 503, 504])
|
status_forcelist=[429, 500, 502, 503, 504])
|
||||||
@@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ
|
|||||||
if proxies:
|
if proxies:
|
||||||
session.proxies = proxies
|
session.proxies = proxies
|
||||||
session.headers = {"User-Agent": ua or G_USER_AGENT}
|
session.headers = {"User-Agent": ua or G_USER_AGENT}
|
||||||
try:
|
return session
|
||||||
if isinstance(url, str) and len(url):
|
|
||||||
result = session.get(str(url))
|
|
||||||
else: # 空url参数直接返回可重用session对象,无需设置return_type
|
|
||||||
return session
|
|
||||||
if not result.ok:
|
|
||||||
return None
|
|
||||||
if return_type == "object":
|
|
||||||
return result
|
|
||||||
elif return_type == "content":
|
|
||||||
return result.content
|
|
||||||
elif return_type == "session":
|
|
||||||
return result, session
|
|
||||||
else:
|
|
||||||
result.encoding = encoding or "utf-8"
|
|
||||||
return result.text
|
|
||||||
except requests.exceptions.ProxyError:
|
|
||||||
print("[-]get_html_session() Proxy error! Please check your Proxy")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"[-]get_html_session() failed. {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# storyline only
|
# storyline only
|
||||||
# 使用 cloudscraper....
|
# 使用 cloudscraper....
|
||||||
|
|||||||
24
scrapinglib/imdb.py
Normal file
24
scrapinglib/imdb.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Imdb(Parser):
    """Parser configuration for IMDb title pages."""

    source = 'imdb'
    imagecut = 0

    # XPath expressions consumed by the Parser getters.
    expr_title = '//h1[@data-testid="hero-title-block__title"]/text()'
    expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()'
    expr_cover = '//head/meta[@property="og:image"]/@content'
    expr_outline = '//head/meta[@property="og:description"]/@content'
    expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()'
    expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()'

    def queryNumberUrl(self, number):
        """Build the IMDb title page URL from ``number``.

        ``number`` is appended to the title URL as-is.

        TODO: distinguish between an ID and a name
        """
        title_id = number
        return "https://www.imdb.com/title/" + title_id
|
||||||
@@ -26,6 +26,14 @@ class Jav321(Parser):
|
|||||||
return 'https://www.jav321.com/search'
|
return 'https://www.jav321.com/search'
|
||||||
|
|
||||||
def getHtmlTree(self, url):
|
def getHtmlTree(self, url):
|
||||||
|
"""
|
||||||
|
特殊处理 仅获取页面调用一次
|
||||||
|
"""
|
||||||
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
resp = httprequest.get(self.detailurl, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
|
self.detailhtml = resp
|
||||||
|
return etree.fromstring(resp, etree.HTMLParser())
|
||||||
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
if "/video/" in resp.url:
|
if "/video/" in resp.url:
|
||||||
self.detailurl = resp.url
|
self.detailurl = resp.url
|
||||||
|
|||||||
@@ -32,7 +32,12 @@ class Javbus(Parser):
|
|||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number
|
self.number = number
|
||||||
try:
|
try:
|
||||||
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
url = "https://www." + secrets.choice([
|
url = "https://www." + secrets.choice([
|
||||||
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
|
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
|
||||||
'cdnbus.fun',
|
'cdnbus.fun',
|
||||||
@@ -61,7 +66,10 @@ class Javbus(Parser):
|
|||||||
self.uncensored = True
|
self.uncensored = True
|
||||||
|
|
||||||
w_number = number.replace('.', '-')
|
w_number = number.replace('.', '-')
|
||||||
self.detailurl = 'https://www.javbus.red/' + w_number
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
else:
|
||||||
|
self.detailurl = 'https://www.javbus.red/' + w_number
|
||||||
self.htmlcode = self.getHtml(self.detailurl)
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
if self.htmlcode == 404:
|
if self.htmlcode == 404:
|
||||||
return 404
|
return 404
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
import re
|
import re
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from .httprequest import get_html_session
|
from .httprequest import request_session
|
||||||
from .parser import Parser
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
@@ -63,8 +63,11 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def search(self, number: str):
|
def search(self, number: str):
|
||||||
self.number = number
|
self.number = number
|
||||||
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
self.detailurl = self.queryNumberUrl(number)
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
else:
|
||||||
|
self.detailurl = self.queryNumberUrl(number)
|
||||||
self.deatilpage = self.session.get(self.detailurl).text
|
self.deatilpage = self.session.get(self.detailurl).text
|
||||||
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
|
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
|
||||||
self.noauth = True
|
self.noauth = True
|
||||||
@@ -193,19 +196,19 @@ class Javdb(Parser):
|
|||||||
|
|
||||||
def getUserRating(self, htmltree):
    """Return the user rating as a float, or ``''`` when it cannot be read.

    The rating is the first ``[0-9.]+`` run in the ``expr_userrating`` text.
    """
    try:
        rating_text = self.getTreeElement(htmltree, self.expr_userrating)
        numbers = re.findall('[0-9.]+', rating_text)
        return float(numbers[0])
    except Exception:
        # Best effort: rating missing or unparsable. Unlike a bare except,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return ''
|
||||||
|
|
||||||
def getUserVotes(self, htmltree):
    """Return the vote count as an int, or ``''`` when it cannot be read.

    The count is the second ``[0-9.]+`` run in the ``expr_uservotes`` text
    (the first run is the rating itself).
    """
    try:
        votes_text = self.getTreeElement(htmltree, self.expr_uservotes)
        numbers = re.findall('[0-9.]+', votes_text)
        return int(numbers[1])
    except Exception:
        # Best effort: count missing or unparsable. Unlike a bare except,
        # this lets KeyboardInterrupt/SystemExit propagate.
        return ''
|
||||||
|
|
||||||
def getaphoto(self, url, session):
|
def getaphoto(self, url, session):
|
||||||
html_page = session.get(url).text
|
html_page = session.get(url).text
|
||||||
|
|||||||
80
scrapinglib/javlibrary.py
Normal file
80
scrapinglib/javlibrary.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
from .httprequest import request_session
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Javlibrary(Parser):
    """Scraper for javlibrary.com detail pages."""

    source = 'javlibrary'

    # Parsed detail page cached by queryNumberUrl() when the search response
    # is already a detail page, so search() does not fetch it a second time.
    htmltree = None

    expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()'
    expr_title = '//div[@id="video_title"]/h3/a/text()'
    expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()'
    expr_tags = '//div[@id="video_genres"]/table/tr/td[@class="text"]/span/a/text()'
    expr_cover = '//img[@id="video_jacket_img"]/@src'
    expr_release = '//div[@id="video_date"]/table/tr/td[@class="text"]/text()'
    expr_studio = '//div[@id="video_maker"]/table/tr/td[@class="text"]/span/a/text()'
    expr_runtime = '//div[@id="video_length"]/table/tr/td/span[@class="text"]/text()'
    expr_userrating = '//div[@id="video_review"]/table/tr/td/span[@class="score"]/text()'
    expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()'
    expr_extrafanart = '//div[@class="previewthumbs"]/img/@src'

    def updateCore(self, core):
        """Copy proxy/verify/storyline settings from ``core`` and set cookies."""
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
            self.verify = core.verify
        if core.morestoryline:
            self.morestoryline = True
        # Mirror the base-class step that hands over a user-specified detail
        # URL; this override would otherwise drop it and search() would never
        # see specifiedUrl set through the core.
        if getattr(core, 'specifiedSource', None) == self.source:
            self.specifiedUrl = getattr(core, 'specifiedUrl', None)
        self.cookies = {'over18':'1'}

    def search(self, number):
        """Scrape the detail page for ``number``.

        Returns the result of ``dictformat`` for the page, or ``404`` when no
        detail URL could be determined.
        """
        self.number = number.upper()
        # Forget any page cached by an earlier search on this instance so a
        # stale tree is never parsed for a different number.
        self.htmltree = None
        self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
        if self.specifiedUrl:
            self.detailurl = self.specifiedUrl
        else:
            self.detailurl = self.queryNumberUrl(self.number)
        if not self.detailurl:
            return 404
        if self.htmltree is None:
            deatils = self.session.get(self.detailurl)
            self.htmltree = etree.fromstring(deatils.text, etree.HTMLParser())
        result = self.dictformat(self.htmltree)
        return result

    def queryNumberUrl(self, number:str):
        """Find the detail-page URL for ``number`` via the site's id search.

        When the response URL already contains ``/?v=jav`` the response body
        is parsed and cached in ``self.htmltree`` and that URL is returned.
        Otherwise the result list is searched for an exact id match.

        Returns:
            The detail URL, or ``None`` when the number is not listed.
        """
        queryUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + number
        queryResult = self.session.get(queryUrl)

        if queryResult and "/?v=jav" in queryResult.url:
            self.htmltree = etree.fromstring(queryResult.text, etree.HTMLParser())
            return queryResult.url

        queryTree = etree.fromstring(queryResult.text, etree.HTMLParser())
        numbers = queryTree.xpath('//div[@class="id"]/text()')
        if number in numbers:
            urls = queryTree.xpath('//div[@class="id"]/../@href')
            position = numbers.index(number)
            # Guard against the two XPath result lists getting out of step.
            if position < len(urls):
                # Only the leading "." of the relative href must go; strip()
                # would also eat dots at the end of the link.
                return "http://www.javlibrary.com/cn" + urls[position].lstrip('.')
        return None

    def getTitle(self, htmltree):
        """Return the page title with the movie number removed."""
        title = super().getTitle(htmltree)
        title = title.replace(self.getNum(htmltree), '').strip()
        return title

    def getCover(self, htmltree):
        """Return the cover URL, adding ``https:`` when the scheme is missing."""
        url = super().getCover(htmltree)
        # An empty result stays empty instead of becoming the bogus "https:".
        if url and not url.startswith('http'):
            url = 'https:' + url
        return url

    def getOutline(self, htmltree):
        """Return the storyline when ``morestoryline`` is enabled, else ``''``."""
        if self.morestoryline:
            from .storyline import getStoryline
            return getStoryline(self.number, self.getUncensored(htmltree))
        return ''
|
||||||
@@ -8,6 +8,7 @@ from .parser import Parser
|
|||||||
|
|
||||||
class Madou(Parser):
|
class Madou(Parser):
|
||||||
source = 'madou'
|
source = 'madou'
|
||||||
|
imagecut = 0
|
||||||
uncensored = True
|
uncensored = True
|
||||||
|
|
||||||
expr_url = '//a[@class="share-weixin"]/@data-url'
|
expr_url = '//a[@class="share-weixin"]/@data-url'
|
||||||
@@ -17,7 +18,10 @@ class Madou(Parser):
|
|||||||
|
|
||||||
def search(self, number):
|
def search(self, number):
|
||||||
self.number = number.lower().strip()
|
self.number = number.lower().strip()
|
||||||
self.detailurl = "https://madou.club/" + number + ".html"
|
if self.specifiedUrl:
|
||||||
|
self.detailurl = self.specifiedUrl
|
||||||
|
else:
|
||||||
|
self.detailurl = "https://madou.club/" + number + ".html"
|
||||||
self.htmlcode = self.getHtml(self.detailurl)
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
if self.htmlcode == 404:
|
if self.htmlcode == 404:
|
||||||
return 404
|
return 404
|
||||||
|
|||||||
@@ -25,7 +25,10 @@ class Mgstage(Parser):
|
|||||||
def search(self, number):
    """Scrape the mgstage product page for ``number``.

    The number is upper-cased and stored, the cookie dict is set, and the
    detail URL is either the user-specified one or the product-detail URL
    built from the number. Returns whatever ``dictformat`` produces for the
    fetched page tree.
    """
    self.number = number.upper()
    self.cookies = {'adc':'1'}
    default_url = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
    # A user-specified URL takes precedence over the generated one.
    self.detailurl = self.specifiedUrl if self.specifiedUrl else default_url
    page_tree = self.getHtmlTree(self.detailurl)
    return self.dictformat(page_tree)
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ from .parser import Parser
|
|||||||
|
|
||||||
class Mv91(Parser):
|
class Mv91(Parser):
|
||||||
source = 'mv91'
|
source = 'mv91'
|
||||||
|
imagecut = 0
|
||||||
|
uncensored = True
|
||||||
|
|
||||||
expr_number = '//div[@class="player-title"]/text()'
|
expr_number = '//div[@class="player-title"]/text()'
|
||||||
expr_title = '//div[@class="player-title"]/text()'
|
expr_title = '//div[@class="player-title"]/text()'
|
||||||
@@ -53,8 +55,8 @@ class Mv91(Parser):
|
|||||||
result = str(finds[0][0])
|
result = str(finds[0][0])
|
||||||
else:
|
else:
|
||||||
result = ' '.join(title.replace('/',' ').split())
|
result = ' '.join(title.replace('/',' ').split())
|
||||||
result = result.split()[0].replace('「预告」','')
|
result = result.split()[0]
|
||||||
return result.strip()
|
return result.replace('「预告」','').strip('/ ')
|
||||||
except:
|
except:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,10 @@ class Parser:
|
|||||||
""" 基础刮削类
|
""" 基础刮削类
|
||||||
"""
|
"""
|
||||||
source = 'base'
|
source = 'base'
|
||||||
# poster: `0` 复制 `1` 裁剪
|
# 推荐剪切poster封面:
|
||||||
|
# `0` 复制cover
|
||||||
|
# `1` 裁剪cover
|
||||||
|
# `3` 下载小封面
|
||||||
imagecut = 1
|
imagecut = 1
|
||||||
uncensored = False
|
uncensored = False
|
||||||
allow_number_change = False
|
allow_number_change = False
|
||||||
@@ -21,6 +24,7 @@ class Parser:
|
|||||||
extraheader = None
|
extraheader = None
|
||||||
cookies = None
|
cookies = None
|
||||||
morestoryline = False
|
morestoryline = False
|
||||||
|
specifiedUrl = None
|
||||||
|
|
||||||
number = ''
|
number = ''
|
||||||
detailurl = ''
|
detailurl = ''
|
||||||
@@ -61,8 +65,19 @@ class Parser:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def search(self, number):
    """Look up a movie number.

    Main flow of a lookup:
    1. obtain the url
    2. fetch the detail page
    3. parse it
    4. return the result
    """
    self.number = number
    # A user-specified URL wins; the number is only queried when none is set.
    self.detailurl = self.specifiedUrl or self.queryNumberUrl(number)
    page_tree = self.getHtmlTree(self.detailurl)
    return self.dictformat(page_tree)
|
||||||
@@ -79,13 +94,16 @@ class Parser:
|
|||||||
self.verify = core.verify
|
self.verify = core.verify
|
||||||
if core.morestoryline:
|
if core.morestoryline:
|
||||||
self.morestoryline = True
|
self.morestoryline = True
|
||||||
|
if core.specifiedSource == self.source:
|
||||||
|
self.specifiedUrl = core.specifiedUrl
|
||||||
|
|
||||||
def queryNumberUrl(self, number):
    """Return the detail-page URL for a movie number.

    Needs to be adapted for each site, or the URL can be obtained directly
    by the caller.
    Keep a copy of the query page; preview images may need it.
    """
    # Placeholder host in the base class; the number is appended as the path.
    return "http://detailurl.ai/" + number
|
||||||
|
|
||||||
def getHtml(self, url, type = None):
|
def getHtml(self, url, type = None):
|
||||||
@@ -115,26 +133,26 @@ class Parser:
|
|||||||
'number': self.getNum(htmltree),
|
'number': self.getNum(htmltree),
|
||||||
'title': self.getTitle(htmltree),
|
'title': self.getTitle(htmltree),
|
||||||
'studio': self.getStudio(htmltree),
|
'studio': self.getStudio(htmltree),
|
||||||
|
'release': self.getRelease(htmltree),
|
||||||
'year': self.getYear(htmltree),
|
'year': self.getYear(htmltree),
|
||||||
'outline': self.getOutline(htmltree),
|
'outline': self.getOutline(htmltree),
|
||||||
'runtime': self.getRuntime(htmltree),
|
'runtime': self.getRuntime(htmltree),
|
||||||
'director': self.getDirector(htmltree),
|
'director': self.getDirector(htmltree),
|
||||||
'actor': self.getActors(htmltree),
|
'actor': self.getActors(htmltree),
|
||||||
'release': self.getRelease(htmltree),
|
'actor_photo': self.getActorPhoto(htmltree),
|
||||||
'cover': self.getCover(htmltree),
|
'cover': self.getCover(htmltree),
|
||||||
'cover_small': self.getSmallCover(htmltree),
|
'cover_small': self.getSmallCover(htmltree),
|
||||||
'extrafanart': self.getExtrafanart(htmltree),
|
'extrafanart': self.getExtrafanart(htmltree),
|
||||||
'trailer': self.getTrailer(htmltree),
|
'trailer': self.getTrailer(htmltree),
|
||||||
'imagecut': self.imagecut,
|
|
||||||
'tag': self.getTags(htmltree),
|
'tag': self.getTags(htmltree),
|
||||||
'label': self.getLabel(htmltree),
|
'label': self.getLabel(htmltree),
|
||||||
'actor_photo': self.getActorPhoto(htmltree),
|
'series': self.getSeries(htmltree),
|
||||||
|
'userrating': self.getUserRating(htmltree),
|
||||||
|
'uservotes': self.getUserVotes(htmltree),
|
||||||
|
'uncensored': self.getUncensored(htmltree),
|
||||||
'website': self.detailurl,
|
'website': self.detailurl,
|
||||||
'source': self.source,
|
'source': self.source,
|
||||||
'series': self.getSeries(htmltree),
|
'imagecut': self.getImagecut(htmltree),
|
||||||
'uncensored': self.getUncensored(htmltree),
|
|
||||||
'userrating': self.getUserRating(htmltree),
|
|
||||||
'uservotes': self.getUserVotes(htmltree)
|
|
||||||
}
|
}
|
||||||
dic = self.extradict(dic)
|
dic = self.extradict(dic)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -215,11 +233,26 @@ class Parser:
|
|||||||
else:
|
else:
|
||||||
return self.uncensored
|
return self.uncensored
|
||||||
|
|
||||||
|
def getImagecut(self, htmlree):
    """Return the imagecut mode, corrected for uncensored titles.

    Correction: for an uncensored title the poster is not cropped from the
    cover, so a mode of ``1`` is switched to ``0`` (and stored on the
    instance). Any other mode is returned unchanged.
    """
    if self.imagecut != 1:
        return self.imagecut
    if self.getUncensored(htmlree):
        self.imagecut = 0
    return self.imagecut
|
||||||
|
|
||||||
def getUserRating(self, htmltree):
    """Return the user rating as a float, or ``''`` when unavailable.

    The text selected by ``self.expr_userrating`` must contain exactly one
    numeric token (digits and dots); anything else is treated as "no rating".
    """
    numstrs = self.getTreeElement(htmltree, self.expr_userrating)
    # A missing or non-text element would make re.findall raise TypeError.
    if not isinstance(numstrs, str):
        return ''
    nums = re.findall('[0-9.]+', numstrs)
    if len(nums) != 1:
        return ''
    try:
        return float(nums[0])
    except ValueError:
        # The pattern also matches tokens such as "." or "1.2.3".
        return ''
|
||||||
|
|
||||||
def getUserVotes(self, htmltree):
    """Return the number of user votes as an int, or ``''`` when unavailable.

    The text selected by ``self.expr_uservotes`` must contain exactly one run
    of digits; anything else is treated as "no vote count".
    """
    votestrs = self.getTreeElement(htmltree, self.expr_uservotes)
    # A missing or non-text element would make re.findall raise TypeError.
    if not isinstance(votestrs, str):
        return ''
    votes = re.findall('[0-9]+', votestrs)
    if len(votes) == 1:
        return int(votes[0])
    return ''
|
||||||
|
|
||||||
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
|
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
|
||||||
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
|
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ import builtins
|
|||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
from lxml.html import fromstring
|
from lxml.html import fromstring
|
||||||
from multiprocessing.dummy import Pool as ThreadPool
|
from multiprocessing.dummy import Pool as ThreadPool
|
||||||
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
|
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session
|
||||||
|
|
||||||
# 舍弃 Amazon 源
|
# 舍弃 Amazon 源
|
||||||
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
|
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
|
||||||
@@ -112,7 +112,8 @@ def getStoryline_airav(number, debug):
|
|||||||
try:
|
try:
|
||||||
site = secrets.choice(('airav.cc','airav4.club'))
|
site = secrets.choice(('airav.cc','airav4.club'))
|
||||||
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
|
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
|
||||||
res, session = get_html_session(url, return_type='session')
|
session = request_session()
|
||||||
|
res = session.get(url)
|
||||||
if not res:
|
if not res:
|
||||||
raise ValueError(f"get_html_by_session('{url}') failed")
|
raise ValueError(f"get_html_by_session('{url}') failed")
|
||||||
lx = fromstring(res.text)
|
lx = fromstring(res.text)
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ class Tmdb(Parser):
|
|||||||
imagecut = 0
|
imagecut = 0
|
||||||
apikey = None
|
apikey = None
|
||||||
|
|
||||||
expr_title = '//head/meta[@property="og:title"]'
|
expr_title = '//head/meta[@property="og:title"]/@content'
|
||||||
expr_release = '//div/span[@class="release"]/text()'
|
expr_release = '//div/span[@class="release"]/text()'
|
||||||
expr_cover = '//head/meta[@property="og:image"]'
|
expr_cover = '//head/meta[@property="og:image"]/@content'
|
||||||
expr_outline = '//head/meta[@property="og:description"]'
|
expr_outline = '//head/meta[@property="og:description"]/@content'
|
||||||
|
|
||||||
# def search(self, number):
|
# def search(self, number):
|
||||||
# self.detailurl = self.queryNumberUrl(number)
|
# self.detailurl = self.queryNumberUrl(number)
|
||||||
@@ -30,11 +30,6 @@ class Tmdb(Parser):
|
|||||||
movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
|
movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
|
||||||
return movieUrl
|
return movieUrl
|
||||||
|
|
||||||
def getTitle(self, htmltree):
|
|
||||||
return self.getTreeElement(htmltree, self.expr_title).get('content')
|
|
||||||
|
|
||||||
def getCover(self, htmltree):
    """Return the cover URL: the site root followed by the ``expr_cover`` value."""
    cover_path = self.getTreeElement(htmltree, self.expr_cover)
    site_root = "https://www.themoviedb.org"
    return site_root + cover_path
|
||||||
|
|
||||||
def getOutline(self, htmltree):
|
|
||||||
return self.getTreeElement(htmltree, self.expr_outline).get('content')
|
|
||||||
|
|||||||
@@ -13,6 +13,9 @@ class Xcity(Parser):
|
|||||||
|
|
||||||
expr_number = '//*[@id="hinban"]/text()'
|
expr_number = '//*[@id="hinban"]/text()'
|
||||||
expr_title = '//*[@id="program_detail_title"]/text()'
|
expr_title = '//*[@id="program_detail_title"]/text()'
|
||||||
|
expr_actor = '//ul/li[@class="credit-links"]/a/text()'
|
||||||
|
expr_actor_link = '//ul/li[@class="credit-links"]/a'
|
||||||
|
expr_actorphoto = '//div[@class="frame"]/div/p/img/@src'
|
||||||
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
|
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
|
||||||
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
|
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
|
||||||
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
|
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
|
||||||
@@ -23,6 +26,7 @@ class Xcity(Parser):
|
|||||||
expr_director = '//*[@id="program_detail_director"]/text()'
|
expr_director = '//*[@id="program_detail_director"]/text()'
|
||||||
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
|
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
|
||||||
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
|
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
|
||||||
|
expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'
|
||||||
|
|
||||||
def getStudio(self, htmltree):
|
def getStudio(self, htmltree):
|
||||||
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
|
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
|
||||||
@@ -57,41 +61,29 @@ class Xcity(Parser):
|
|||||||
return getStoryline(self.number, uncensored=False)
|
return getStoryline(self.number, uncensored=False)
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getActors(self, htmltree):
|
|
||||||
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
|
|
||||||
t = []
|
|
||||||
for i in htmla:
|
|
||||||
t.append(i.text.strip())
|
|
||||||
return t
|
|
||||||
|
|
||||||
def getActorPhoto(self, htmltree):
    """Return a ``{actor name: photo URL}`` mapping for the credited actors.

    Each actor link selected by ``self.expr_actor_link`` is followed to the
    actor's page, where ``self.expr_actorphoto`` selects the portrait. Actors
    whose page cannot be fetched, who have no portrait, or whose portrait is
    the ``noimage.gif`` placeholder are left out.
    """
    site_root = "https://xcity.jp"

    # Name -> href; a repeated name keeps its last link, and each name is
    # fetched only once.
    actor_links = {}
    for link in self.getTreeAll(htmltree, self.expr_actor_link):
        name = (link.text or '').strip()  # link.text is None for empty <a>
        href = link.attrib.get('href')
        if name and href:
            actor_links[name] = href

    photos = {}
    for name, href in actor_links.items():
        try:
            # urljoin copes with absolute hrefs and hrefs without a leading "/".
            adtree = self.getHtmlTree(urljoin(site_root, href))
            picUrl = self.getTreeElement(adtree, self.expr_actorphoto)
            # An empty path would otherwise resolve to the bare site root.
            if not picUrl or 'noimage.gif' in picUrl:
                continue
            photos[name] = urljoin(site_root, picUrl)
        except Exception:
            # Best-effort: one broken actor page must not abort the others.
            continue
    return photos
|
||||||
|
|
||||||
def getExtrafanart(self, htmltree):
    """Return the list of sample-image URLs selected by ``expr_extrafanart``.

    Links without a scheme get ``https:`` prepended; links that already start
    with ``http`` are returned unchanged instead of being corrupted by a
    second scheme prefix. An empty list is returned when nothing matches.
    """
    # NOTE(review): the previous regex-based version also removed
    # '/scene/small' from every link - confirm these hrefs are full size.
    arts = self.getTreeAll(htmltree, self.expr_extrafanart)
    return [i if i.startswith('http') else "https:" + i for i in arts]
|
|
||||||
|
|
||||||
def open_by_browser(self, number):
|
def open_by_browser(self, number):
|
||||||
xcity_number = number.replace('-','')
|
xcity_number = number.replace('-','')
|
||||||
@@ -108,8 +100,12 @@ class Xcity(Parser):
|
|||||||
|
|
||||||
def search(self, number):
    """Scrape the xcity detail page for ``number``.

    With a user-specified URL the page tree is fetched directly from it.
    Otherwise the page is opened through ``open_by_browser`` and the detail
    URL is taken from the browser. Returns the ``dictformat`` result.
    """
    self.number = number
    if not self.specifiedUrl:
        # Regular path: let the browser helper find and load the page.
        self.detail_page, self.browser = self.open_by_browser(number)
        self.detailurl = self.browser.url
        page_tree = etree.fromstring(self.detail_page, etree.HTMLParser())
    else:
        self.detailurl = self.specifiedUrl
        page_tree = self.getHtmlTree(self.detailurl)
    return self.dictformat(page_tree)
|
||||||
|
|||||||
Reference in New Issue
Block a user