update scrapinglib

- support specifiedUrl when scraping single movie
- support javlibrary and rating
This commit is contained in:
Mathhew
2022-07-28 18:45:54 +08:00
parent ee1306fb3b
commit ce388edce8
23 changed files with 379 additions and 176 deletions

View File

@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-
from .api import search
from .api import search, getSupportedSources

View File

@@ -8,6 +8,9 @@ from .javbus import Javbus
class Airav(Parser):
source = 'airav'
# for javbus
specifiedSource = None
addtion_Javbus = True
expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()'
@@ -21,9 +24,13 @@ class Airav(Parser):
def search(self, number):
self.number = number
self.detailurl = 'https://cn.airav.wiki/video/' + number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = self.queryNumberUrl(self.number)
if self.addtion_Javbus:
engine = Javbus()
javbusinfo = engine.scrape(number, self)
javbusinfo = engine.scrape(self.number, self)
if javbusinfo == 404:
self.javbus = {"title": ""}
else:
@@ -33,8 +40,19 @@ class Airav(Parser):
result = self.dictformat(htmltree)
return result
def queryNumberUrl(self, number):
    """Find the detail-page URL for `number` through the airav search page.

    Every link in the search result list is inspected; the first one whose
    displayed number equals `number` (ignoring `-` and letter case) wins.
    On a match `self.number` is replaced by the number as the site shows it.

    :param number: movie number to look for
    :return: URL of the matching detail page, or the direct
        `/video/<number>` URL when no search result matches
    """
    queryUrl = "https://cn.airav.wiki/?search=" + number
    queryTree = self.getHtmlTree(queryUrl)
    results = self.getTreeAll(queryTree, '//div[contains(@class,"videoList")]/div/a')
    # Normalise once; both sides are upper-cased so a lower-case number on
    # the site still matches.
    wanted = number.replace('-', '').upper()
    for link in results:
        # NOTE(review): this expression starts with `//`; if getTreeElement
        # forwards it to lxml's xpath() it is evaluated against the whole
        # document rather than relative to `link` -- confirm against the
        # helper and the page markup before changing it.
        num = self.getTreeElement(link, '//div/div[contains(@class,"videoNumber")]/p[1]/text()')
        if num.replace('-', '').upper() == wanted:
            self.number = num
            return "https://cn.airav.wiki" + link.attrib['href']
    return 'https://cn.airav.wiki/video/' + number
def getNum(self, htmltree):
# return super().getNum(htmltree)
if self.addtion_Javbus:
result = self.javbus.get('number')
if isinstance(result, str) and len(result):
return result
@@ -48,12 +66,14 @@ class Airav(Parser):
return result
def getStudio(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('studio')
if isinstance(result, str) and len(result):
return result
return super().getStudio(htmltree)
def getRelease(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('release')
if isinstance(result, str) and len(result):
return result
@@ -63,6 +83,7 @@ class Airav(Parser):
return ''
def getYear(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('year')
if isinstance(result, str) and len(result):
return result
@@ -73,38 +94,39 @@ class Airav(Parser):
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('runtime')
if isinstance(result, str) and len(result):
return result
return ''
def getDirector(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getActors(self, htmltree):
b=[]
a = super().getActors(htmltree)
for v in a:
v = v.strip()
if len(v):
b.append(v)
b = [ i.strip() for i in a if len(i)]
if len(b):
return b
if self.addtion_Javbus:
result = self.javbus.get('actor')
if isinstance(result, list) and len(result):
return result
return []
def getCover(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('cover')
if isinstance(result, str) and len(result):
return result
return super().getCover(htmltree)
def getSeries(self, htmltree):
if self.addtion_Javbus:
result = self.javbus.get('series')
if isinstance(result, str) and len(result):
return result

View File

@@ -18,29 +18,45 @@ from .mgstage import Mgstage
from .javbus import Javbus
from .xcity import Xcity
from .avsox import Avsox
from .javlibrary import Javlibrary
from .tmdb import Tmdb
from .imdb import Imdb
def search(number, sources: str=None, proxies=None, verify=None, type='adult',
specifiedSource=None, specifiedUrl=None,
dbcookies=None, dbsite=None, morestoryline=False):
""" 根据``番号/电影``名搜索信息
""" 根据`番号/电影`名搜索信息
:param number: number/name depends on type
:param sources: sources string with `,` like ``avsox,javbus``
:param type: ``adult``, ``general``
:param sources: sources string with `,` Eg: `avsox,javbus`
:param type: `adult`, `general`
"""
sc = Scraping()
return sc.search(number, sources, proxies=proxies, verify=verify, type=type,
specifiedSource=specifiedSource, specifiedUrl=specifiedUrl,
dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline)
def getSupportedSources(tag='adult'):
    """Return the names of the supported scraping sources.

    :param tag: `adult` selects the adult sources; any other value
        (e.g. `general`) selects the general sources
    :return: the source names joined with `,`
    """
    scraping = Scraping()
    names = scraping.adult_full_sources if tag == 'adult' else scraping.general_full_sources
    return ','.join(names)
class Scraping():
"""
"""
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
'gcolle', 'javdb', 'getchu']
adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
'getchu', 'gcolle'
]
adult_func_mapping = {
'avsox': Avsox().scrape,
'javbus': Javbus().scrape,
@@ -57,15 +73,19 @@ class Scraping():
'gcolle': Gcolle().scrape,
'javdb': Javdb().scrape,
'getchu': Getchu().scrape,
'javlibrary': Javlibrary().scrape,
}
general_full_sources = ['tmdb']
general_full_sources = ['tmdb','imdb']
general_func_mapping = {
'tmdb': Tmdb().scrape,
'imdb': Imdb().scrape,
}
proxies = None
verify = None
specifiedSource = None
specifiedUrl = None
dbcookies = None
dbsite = None
@@ -73,9 +93,12 @@ class Scraping():
morestoryline = False
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
specifiedSource=None, specifiedUrl=None,
dbcookies=None, dbsite=None, morestoryline=False):
self.proxies = proxies
self.verify = verify
self.specifiedSource = specifiedSource
self.specifiedUrl = specifiedUrl
self.dbcookies = dbcookies
self.dbsite = dbsite
self.morestoryline = morestoryline
@@ -88,6 +111,9 @@ class Scraping():
""" 查询电影电视剧
imdb,tmdb
"""
if self.specifiedSource:
sources = [self.specifiedSource]
else:
sources = self.checkGeneralSources(sources, name)
json_data = {}
for source in sources:
@@ -116,6 +142,9 @@ class Scraping():
return json_data
def searchAdult(self, number, sources):
if self.specifiedSource:
sources = [self.specifiedSource]
else:
sources = self.checkAdultSources(sources, number)
json_data = {}
for source in sources:

View File

@@ -50,10 +50,14 @@ class Avsox(Parser):
def getSmallCover(self, htmltree):
    """Return the small preview image found on the search page.

    The image is read from `self.searchtree`, not from `htmltree`. When the
    lookup raises (for example because `self.searchtree` was never set),
    `self.imagecut` is switched to `1` (crop the cover) and an empty string
    is returned.

    :param htmltree: detail page tree (unused here)
    :return: small cover URL, or `''` on failure
    """
    try:
        return self.getTreeElement(self.searchtree, self.expr_smallcover)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt and
        # SystemExit must still propagate.
        self.imagecut = 1
        return ''
def getTags(self, htmltree):
tags = self.getTreeElement(htmltree).split(',')
tags = self.getTreeElement(htmltree, self.expr_tags).split(',')
return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
def getOutline(self, htmltree):

View File

@@ -22,6 +22,9 @@ class Carib(Parser):
def search(self, number):
self.number = number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
htmlcode = self.getHtml(self.detailurl)
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:

View File

@@ -29,7 +29,12 @@ class Dlsite(Parser):
def search(self, number):
self.cookies = {'locale': 'zh-cn'}
if "RJ" in number or "VJ" in number:
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
# TODO 应该从页面内获取 number
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
htmltree = self.getHtmlTree(self.detailurl)
elif "RJ" in number or "VJ" in number:
self.number = number.upper()
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
htmltree = self.getHtmlTree(self.detailurl)

View File

@@ -11,15 +11,21 @@ class Fanza(Parser):
expr_title = '//*[starts-with(@id, "title")]/text()'
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
expr_cover = '//head/meta[@property="og:image"]'
expr_cover = './/head/meta[@property="og:image"]/@content'
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
expr_outline = "//div[@class='mg-b20 lh4']/text()"
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
expr_outline_og = '//head/meta[@property="og:description"]'
expr_outline_og = '//head/meta[@property="og:description"]/@content'
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
def search(self, number):
self.number = number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
durl = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
self.htmltree = self.getHtmlTree(durl)
result = self.dictformat(self.htmltree)
return result
# fanza allow letter + number + underscore, normalize the input here
# @note: I only find the usage of underscore as h_test123456789
fanza_search_number = number
@@ -75,7 +81,7 @@ class Fanza(Parser):
if result == '':
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
result = self.getTreeElement(htmltree, self.expr_outline_og)
return result
except:
return ''
@@ -99,9 +105,6 @@ class Fanza(Parser):
result = self.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getCover(self, htmltree):
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
def getTags(self, htmltree):
return self.getFanzaStrings('ジャンル:')

View File

@@ -22,7 +22,10 @@ class Fc2(Parser):
expr_tags = "//a[@class='tag tagTag']/text()"
def search(self, number):
self.number = number.replace('FC2-', '').replace('fc2-', '')
self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '')
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:

View File

@@ -2,7 +2,7 @@
import re
from lxml import etree
from .httprequest import get_html_session
from .httprequest import request_session
from .parser import Parser
@@ -27,9 +27,12 @@ class Gcolle(Parser):
def search(self, number):
self.number = number.upper().replace('GCOLLE-','')
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + self.number).text
session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
htmlcode = session.get(self.detailurl).text
htmltree = etree.HTML(htmlcode)
r18url = self.getTreeElement(htmltree, self.expr_r18)

View File

@@ -35,7 +35,7 @@ class wwwGetchu(Parser):
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
expr_title = '//*[@id="soft-title"]/text()'
expr_cover = '//head/meta[@property="og:image"]'
expr_cover = '//head/meta[@property="og:image"]/@content'
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
@@ -67,9 +67,6 @@ class wwwGetchu(Parser):
def getNum(self, htmltree):
return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
def getCover(self, htmltree):
return self.getTreeElement(htmltree, self.expr_cover).get('content')
def getActors(self, htmltree):
return super().getDirector(htmltree)

View File

@@ -9,8 +9,9 @@ from cloudscraper import create_scraper
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
G_DEFAULT_TIMEOUT = 10
def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
def get(url: str, cookies=None, ua: str=None, extra_headers=None, return_type: str=None, encoding: str=None,
retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
网页请求核心函数
@@ -43,8 +44,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
raise Exception('Connect Failed')
def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
def post(url: str, data: dict, files=None, cookies=None, ua: str=None, return_type: str=None, encoding: str=None,
retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
是否使用代理应由上层处理
"""
@@ -74,11 +75,6 @@ def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_
raise Exception('Connect Failed')
#
# TODO: 以下临时使用,更新完各站后,再更新
#
class TimeoutHTTPAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs):
self.timeout = G_DEFAULT_TIMEOUT
@@ -94,10 +90,10 @@ class TimeoutHTTPAdapter(HTTPAdapter):
return super().send(request, **kwargs)
# with keep-alive feature
# storyline carib gcolle javdb only
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
def request_session(cookies=None, ua: str=None, retry: int=3, timeout: int=G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
keep-alive
"""
session = requests.Session()
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
@@ -110,27 +106,8 @@ def get_html_session(url: str = None, cookies = None, ua: str = None, return_typ
if proxies:
session.proxies = proxies
session.headers = {"User-Agent": ua or G_USER_AGENT}
try:
if isinstance(url, str) and len(url):
result = session.get(str(url))
else: # 空url参数直接返回可重用session对象无需设置return_type
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "session":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_session() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_session() failed. {e}")
return None
# storyline only
# 使用 cloudscraper....

24
scrapinglib/imdb.py Normal file
View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from .parser import Parser
class Imdb(Parser):
    """Scraper for IMDb title pages."""

    source = 'imdb'
    imagecut = 0

    expr_title = '//h1[@data-testid="hero-title-block__title"]/text()'
    expr_release = '//a[contains(text(),"Release date")]/following-sibling::div[1]/ul/li/a/text()'
    expr_cover = '//head/meta[@property="og:image"]/@content'
    expr_outline = '//head/meta[@property="og:description"]/@content'
    expr_actor = '//h3[contains(text(),"Top cast")]/../../../following-sibling::div[1]/div[2]/div/div/a/text()'
    expr_tags = '//div[@data-testid="genres"]/div[2]/a/ul/li/text()'

    def queryNumberUrl(self, number):
        """Build the IMDb title-page URL for `number`.

        TODO: tell an IMDb ID apart from a movie name.

        :param number: value appended to the title URL as-is
        :return: the title-page URL
        """
        return "https://www.imdb.com/title/" + number

View File

@@ -26,6 +26,14 @@ class Jav321(Parser):
return 'https://www.jav321.com/search'
def getHtmlTree(self, url):
"""
特殊处理 仅获取页面调用一次
"""
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
resp = httprequest.get(self.detailurl, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.detailhtml = resp
return etree.fromstring(resp, etree.HTMLParser())
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
if "/video/" in resp.url:
self.detailurl = resp.url

View File

@@ -33,6 +33,11 @@ class Javbus(Parser):
def search(self, number):
self.number = number
try:
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
return result
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
@@ -61,6 +66,9 @@ class Javbus(Parser):
self.uncensored = True
w_number = number.replace('.', '-')
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://www.javbus.red/' + w_number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:

View File

@@ -4,7 +4,7 @@
import re
from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_session
from .httprequest import request_session
from .parser import Parser
@@ -63,7 +63,10 @@ class Javdb(Parser):
def search(self, number: str):
self.number = number
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = self.queryNumberUrl(number)
self.deatilpage = self.session.get(self.detailurl).text
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
@@ -193,19 +196,19 @@ class Javdb(Parser):
def getUserRating(self, htmltree):
try:
result = str(self.getTreeElement(htmltree, self.expr_userrating))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0])
numstrs = self.getTreeElement(htmltree, self.expr_userrating)
nums = re.findall('[0-9.]+', numstrs)
return float(nums[0])
except:
return
return ''
def getUserVotes(self, htmltree):
try:
result = str(self.getTreeElement(htmltree, self.expr_uservotes))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return int(v[0][1])
result = self.getTreeElement(htmltree, self.expr_uservotes)
v = re.findall('[0-9.]+', result)
return int(v[1])
except:
return
return ''
def getaphoto(self, url, session):
html_page = session.get(url).text

80
scrapinglib/javlibrary.py Normal file
View File

@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
from lxml import etree
from .httprequest import request_session
from .parser import Parser
class Javlibrary(Parser):
    """Scraper for javlibrary.com detail pages."""

    source = 'javlibrary'

    # Detail-page tree cached by queryNumberUrl() when the search request
    # itself already ended on the detail page; reset at the start of search().
    htmltree = None

    expr_number = '//div[@id="video_id"]/table/tr/td[@class="text"]/text()'
    expr_title = '//div[@id="video_title"]/h3/a/text()'
    expr_actor = '//div[@id="video_cast"]/table/tr/td[@class="text"]/span/span[@class="star"]/a/text()'
    expr_tags = '//div[@id="video_genres"]/table/tr/td[@class="text"]/span/a/text()'
    expr_cover = '//img[@id="video_jacket_img"]/@src'
    expr_release = '//div[@id="video_date"]/table/tr/td[@class="text"]/text()'
    expr_studio = '//div[@id="video_maker"]/table/tr/td[@class="text"]/span/a/text()'
    expr_runtime = '//div[@id="video_length"]/table/tr/td/span[@class="text"]/text()'
    expr_userrating = '//div[@id="video_review"]/table/tr/td/span[@class="score"]/text()'
    expr_director = '//div[@id="video_director"]/table/tr/td[@class="text"]/span/a/text()'
    expr_extrafanart = '//div[@class="previewthumbs"]/img/@src'

    def updateCore(self, core):
        """Copy the request settings from `core` and set the `over18` cookie.

        :param core: object carrying `proxies`, `verify`, `morestoryline`,
            `specifiedSource` and `specifiedUrl`
        """
        if core.proxies:
            self.proxies = core.proxies
        if core.verify:
            self.verify = core.verify
        if core.morestoryline:
            self.morestoryline = True
        # Take over a caller-specified detail URL only when it is addressed to
        # this source; otherwise clear it so a URL from an earlier search on
        # the same instance is not reused.
        if core.specifiedSource == self.source:
            self.specifiedUrl = core.specifiedUrl
        else:
            self.specifiedUrl = None
        self.cookies = {'over18':'1'}

    def search(self, number):
        """Look up `number` and return the parsed result.

        :param number: movie number; it is upper-cased before use
        :return: the value produced by `dictformat`, or `404` when no detail
            URL could be determined
        """
        self.number = number.upper()
        # Drop any tree cached by a previous search on this instance so the
        # page parsed below always belongs to the current number.
        self.htmltree = None
        self.session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
        if self.specifiedUrl:
            self.detailurl = self.specifiedUrl
        else:
            self.detailurl = self.queryNumberUrl(self.number)
        if not self.detailurl:
            return 404
        # queryNumberUrl() may already have stored the detail page tree.
        if self.htmltree is None:
            details = self.session.get(self.detailurl)
            self.htmltree = etree.fromstring(details.text, etree.HTMLParser())
        return self.dictformat(self.htmltree)

    def queryNumberUrl(self, number:str):
        """Search the site by id and return the detail-page URL.

        When the search response URL already contains `/?v=jav`, that page is
        parsed and cached in `self.htmltree`. Otherwise the result list is
        scanned for an entry whose id text equals `number`.

        :param number: movie number to search for
        :return: detail-page URL, or `None` when nothing matches
        """
        queryUrl = "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword=" + number
        queryResult = self.session.get(queryUrl)
        if queryResult and "/?v=jav" in queryResult.url:
            self.htmltree = etree.fromstring(queryResult.text, etree.HTMLParser())
            return queryResult.url
        queryTree = etree.fromstring(queryResult.text, etree.HTMLParser())
        numbers = queryTree.xpath('//div[@class="id"]/text()')
        if number in numbers:
            urls = queryTree.xpath('//div[@class="id"]/../@href')
            detailurl = urls[numbers.index(number)]
            # Strip the dots so the href can be appended to the site prefix.
            return "http://www.javlibrary.com/cn" + detailurl.strip('.')
        return None

    def getTitle(self, htmltree):
        """Return the page title with the movie number removed."""
        title = super().getTitle(htmltree)
        title = title.replace(self.getNum(htmltree), '').strip()
        return title

    def getCover(self, htmltree):
        """Return the cover URL, adding `https:` when it lacks a scheme.

        An empty result is returned unchanged instead of becoming `https:`.
        """
        url = super().getCover(htmltree)
        if url and not url.startswith('http'):
            url = 'https:' + url
        return url

    def getOutline(self, htmltree):
        """Return the storyline when `morestoryline` is enabled, else `''`."""
        if self.morestoryline:
            from .storyline import getStoryline
            return getStoryline(self.number, self.getUncensored(htmltree))
        return ''

View File

@@ -8,6 +8,7 @@ from .parser import Parser
class Madou(Parser):
source = 'madou'
imagecut = 0
uncensored = True
expr_url = '//a[@class="share-weixin"]/@data-url'
@@ -17,6 +18,9 @@ class Madou(Parser):
def search(self, number):
self.number = number.lower().strip()
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = "https://madou.club/" + number + ".html"
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:

View File

@@ -25,6 +25,9 @@ class Mgstage(Parser):
def search(self, number):
self.number = number.upper()
self.cookies = {'adc':'1'}
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
htmltree =self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)

View File

@@ -8,6 +8,8 @@ from .parser import Parser
class Mv91(Parser):
source = 'mv91'
imagecut = 0
uncensored = True
expr_number = '//div[@class="player-title"]/text()'
expr_title = '//div[@class="player-title"]/text()'
@@ -53,8 +55,8 @@ class Mv91(Parser):
result = str(finds[0][0])
else:
result = ' '.join(title.replace('/',' ').split())
result = result.split()[0].replace('「预告」','')
return result.strip()
result = result.split()[0]
return result.replace('「预告」','').strip('/ ')
except:
return ''

View File

@@ -11,7 +11,10 @@ class Parser:
""" 基础刮削类
"""
source = 'base'
# poster: `0` 复制 `1` 裁剪
# 推荐剪切poster封面:
# `0` 复制cover
# `1` 裁剪cover
# `3` 下载小封面
imagecut = 1
uncensored = False
allow_number_change = False
@@ -21,6 +24,7 @@ class Parser:
extraheader = None
cookies = None
morestoryline = False
specifiedUrl = None
number = ''
detailurl = ''
@@ -61,7 +65,18 @@ class Parser:
return result
def search(self, number):
""" 查询番号
查询主要流程:
1. 获取 url
2. 获取详情页面
3. 解析
4. 返回 result
"""
self.number = number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
else:
self.detailurl = self.queryNumberUrl(number)
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
@@ -79,13 +94,16 @@ class Parser:
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
if core.specifiedSource == self.source:
self.specifiedUrl = core.specifiedUrl
def queryNumberUrl(self, number):
""" 根据番号查询详细信息url
需要针对不同站点修改,或者在上层直接获取
备份查询页面,预览图可能需要
"""
url = httprequest.get(number)
url = "http://detailurl.ai/" + number
return url
def getHtml(self, url, type = None):
@@ -115,26 +133,26 @@ class Parser:
'number': self.getNum(htmltree),
'title': self.getTitle(htmltree),
'studio': self.getStudio(htmltree),
'release': self.getRelease(htmltree),
'year': self.getYear(htmltree),
'outline': self.getOutline(htmltree),
'runtime': self.getRuntime(htmltree),
'director': self.getDirector(htmltree),
'actor': self.getActors(htmltree),
'release': self.getRelease(htmltree),
'actor_photo': self.getActorPhoto(htmltree),
'cover': self.getCover(htmltree),
'cover_small': self.getSmallCover(htmltree),
'extrafanart': self.getExtrafanart(htmltree),
'trailer': self.getTrailer(htmltree),
'imagecut': self.imagecut,
'tag': self.getTags(htmltree),
'label': self.getLabel(htmltree),
'actor_photo': self.getActorPhoto(htmltree),
'series': self.getSeries(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree),
'uncensored': self.getUncensored(htmltree),
'website': self.detailurl,
'source': self.source,
'series': self.getSeries(htmltree),
'uncensored': self.getUncensored(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree)
'imagecut': self.getImagecut(htmltree),
}
dic = self.extradict(dic)
except Exception as e:
@@ -215,11 +233,26 @@ class Parser:
else:
return self.uncensored
def getImagecut(self, htmlree):
    """Return the poster handling mode for the current result.

    An uncensored result whose mode is `1` (crop the cover) is reported as
    `0` (copy the cover) instead. `self.imagecut` itself is left untouched,
    so the correction applies to this result only and does not carry over
    to later results handled by the same parser instance.

    :param htmlree: detail page tree, passed on to `getUncensored`
    :return: the effective `imagecut` value
    """
    if self.imagecut == 1 and self.getUncensored(htmlree):
        return 0
    return self.imagecut
def getUserRating(self, htmltree):
return self.getTreeElement(htmltree, self.expr_userrating)
numstrs = self.getTreeElement(htmltree, self.expr_userrating)
nums = re.findall('[0-9.]+', numstrs)
if len(nums) == 1:
return float(nums[0])
return ''
def getUserVotes(self, htmltree):
return self.getTreeElement(htmltree, self.expr_uservotes)
votestrs = self.getTreeElement(htmltree, self.expr_uservotes)
votes = re.findall('[0-9]+', votestrs)
if len(votes) == 1:
return int(votes[0])
return ''
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0

View File

@@ -13,7 +13,7 @@ import builtins
from urllib.parse import urljoin
from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, request_session
# 舍弃 Amazon 源
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
@@ -112,7 +112,8 @@ def getStoryline_airav(number, debug):
try:
site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, session = get_html_session(url, return_type='session')
session = request_session()
res = session.get(url)
if not res:
raise ValueError(f"get_html_by_session('{url}') failed")
lx = fromstring(res.text)

View File

@@ -13,10 +13,10 @@ class Tmdb(Parser):
imagecut = 0
apikey = None
expr_title = '//head/meta[@property="og:title"]'
expr_title = '//head/meta[@property="og:title"]/@content'
expr_release = '//div/span[@class="release"]/text()'
expr_cover = '//head/meta[@property="og:image"]'
expr_outline = '//head/meta[@property="og:description"]'
expr_cover = '//head/meta[@property="og:image"]/@content'
expr_outline = '//head/meta[@property="og:description"]/@content'
# def search(self, number):
# self.detailurl = self.queryNumberUrl(number)
@@ -30,11 +30,6 @@ class Tmdb(Parser):
movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
return movieUrl
def getTitle(self, htmltree):
return self.getTreeElement(htmltree, self.expr_title).get('content')
def getCover(self, htmltree):
return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content')
return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover)
def getOutline(self, htmltree):
return self.getTreeElement(htmltree, self.expr_outline).get('content')

View File

@@ -13,6 +13,9 @@ class Xcity(Parser):
expr_number = '//*[@id="hinban"]/text()'
expr_title = '//*[@id="program_detail_title"]/text()'
expr_actor = '//ul/li[@class="credit-links"]/a/text()'
expr_actor_link = '//ul/li[@class="credit-links"]/a'
expr_actorphoto = '//div[@class="frame"]/div/p/img/@src'
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
@@ -23,6 +26,7 @@ class Xcity(Parser):
expr_director = '//*[@id="program_detail_director"]/text()'
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
expr_extrafanart = '//div[@id="sample_images"]/div/a/@href'
def getStudio(self, htmltree):
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
@@ -57,41 +61,29 @@ class Xcity(Parser):
return getStoryline(self.number, uncensored=False)
return ''
def getActors(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = {i.text.strip(): i['href'] for i in htmla}
treea = self.getTreeAll(htmltree, self.expr_actor_link)
t = {i.text.strip(): i.attrib['href'] for i in treea}
o = {}
for k, v in t.items():
r = self.browser.open_relative(v)
if not r.ok:
actorpageUrl = "https://xcity.jp" + v
try:
adtree = self.getHtmlTree(actorpageUrl)
picUrl = self.getTreeElement(adtree, self.expr_actorphoto)
if 'noimage.gif' in picUrl:
continue
pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
if 'noimage.gif' in pic['src']:
continue
o[k] = urljoin(self.browser.url, pic['src'])
o[k] = urljoin("https://xcity.jp", picUrl)
except:
pass
return o
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
html = html_pather.search(self.detail_page)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
arts = self.getTreeAll(htmltree, self.expr_extrafanart)
extrafanart = []
for i in arts:
i = "https:" + i
extrafanart.append(i)
return extrafanart
def open_by_browser(self, number):
xcity_number = number.replace('-','')
@@ -108,6 +100,10 @@ class Xcity(Parser):
def search(self, number):
self.number = number
if self.specifiedUrl:
self.detailurl = self.specifiedUrl
lx = self.getHtmlTree(self.detailurl)
else:
self.detail_page, self.browser = self.open_by_browser(number)
self.detailurl = self.browser.url
lx = etree.fromstring(self.detail_page, etree.HTMLParser())