# ===== airav.py =====
import sys
sys.path.append('../')
from bs4 import BeautifulSoup  # requires beautifulsoup4
from ADC_function import *
from WebCrawler import javbus

'''
API
signup:   https://www.airav.wiki/api/auth/signup
settings: https://www.airav.wiki/api/get_web_settings
search:   https://www.airav.wiki/api/video/list?lng=zh-CN&search=
search:   https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
'''
host = 'https://www.airav.wiki'


# airav has no actor photos, so use the javbus ones directly
def getActorPhoto(javbus_json):
    result = javbus_json.get('actor_photo')
    if isinstance(result, dict) and len(result):
        return result
    return ''


def getTitle(htmlcode):  # get the title
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    title = str(html.xpath('/html/head/title/text()')[0])
    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
    return result


def getStudio(htmlcode, javbus_json):  # get the studio
    # prefer the javbus data when it is available
    result = javbus_json.get('studio')
    if isinstance(result, str) and len(result):
        return result
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")


def getYear(htmlcode, javbus_json):  # get the year
    result = javbus_json.get('year')
    if isinstance(result, str) and len(result):
        return result
    release = getRelease(htmlcode, javbus_json)
    if len(release) != len('2000-01-01'):
        return ''
    return release[:4]


def getCover(htmlcode, javbus_json):  # get the cover image
    result = javbus_json.get('cover')
    if isinstance(result, str) and len(result):
        return result
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]


def getRelease(htmlcode, javbus_json):  # get the release date
    result = javbus_json.get('release')
    if isinstance(result, str) and len(result):
        return result
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
    except:
        return ''
    return result


def getRuntime(javbus_json):  # get the runtime
    result = javbus_json.get('runtime')
    if isinstance(result, str) and len(result):
        return result
    return ''


# the airav actress database mostly stores kanji names while javbus mostly uses kana,
# so airav takes priority here
def getActor(htmlcode, javbus_json):  # get the actresses
    b = []
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
    for v in a:
        v = v.strip()
        if len(v):
            b.append(v)
    if len(b):
        return b
    result = javbus_json.get('actor')
    if isinstance(result, list) and len(result):
        return result
    return []


def getNum(htmlcode, javbus_json):  # get the movie number
    result = javbus_json.get('number')
    if isinstance(result, str) and len(result):
        return result
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    title = str(html.xpath('/html/head/title/text()')[0])
    result = str(re.findall(r'^\[(.*?)]', title)[0])
    return result


def getDirector(javbus_json):  # get the director
    result = javbus_json.get('director')
    if isinstance(result, str) and len(result):
        return result
    return ''


def getOutline(htmlcode):  # get the synopsis
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n', '').strip()
        return result
    except:
        return ''


def getSerise(javbus_json):  # get the series
    result = javbus_json.get('series')
    if isinstance(result, str) and len(result):
        return result
    return ''


def getTag(htmlcode):  # get the tags
    tag = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    x = soup.find_all(attrs={'class': 'tagBtnMargin'})
    a = x[0].find_all('a')

    for i in a:
        tag.append(i.get_text())
    return tag


def getExtrafanart(htmlcode):  # get the extra fanart (stills)
    html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
    html = html_pather.search(htmlcode)
    if html:
        html = html.group()
        extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
        extrafanart_imgs = extrafanart_pather.findall(html)
        if extrafanart_imgs:
            return extrafanart_imgs
    return ''


def search(keyword):  # search and return all results
    result = []
    page = 1
    while page > 0:
        # sample response:
        # search_result = {"offset": 0,"count": 4,"result": [
        #     {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
        #      "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
        #     {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
        #      "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
        #     {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
        #      "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
        #     {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
        #      "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
        # ],"status": "ok"}
        search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))

        try:
            json_data = json.loads(search_result)
        except json.decoder.JSONDecodeError:
            # print("[-]Json decoder error!")
            return []

        result_offset = int(json_data["offset"])
        result_count = int(json_data["count"])
        result_size = len(json_data["result"])
        if result_count <= 0 or result_size <= 0:
            return result
        elif result_count > result_offset + result_size:  # fetch the next page
            result.extend(json_data["result"])
            page += 1
        elif result_count == result_offset + result_size:  # this was the last page
            result.extend(json_data["result"])
            page = 0
        else:
            page = 0

    return result


def main(number):
    try:
        try:
            htmlcode = get_html('https://cn.airav.wiki/video/' + number)
            javbus_json = json.loads(javbus.main(number))
        except:
            # print(number)
            pass

        dic = {
            # title: airav
            'title': getTitle(htmlcode),
            # studio: try javbus first, fall back to this site
            'studio': getStudio(htmlcode, javbus_json),
            # year: try javbus first, fall back to this site
            'year': getYear(htmlcode, javbus_json),
            # synopsis: airav
            'outline': getOutline(htmlcode),
            # runtime: javbus
            'runtime': getRuntime(javbus_json),
            # director: javbus
            'director': getDirector(javbus_json),
            # actors: try airav first
            'actor': getActor(htmlcode, javbus_json),
            # release date: try javbus first
            'release': getRelease(htmlcode, javbus_json),
            # number: javbus
            'number': getNum(htmlcode, javbus_json),
            # cover URL: javbus
            'cover': getCover(htmlcode, javbus_json),
            # extra fanart
            'extrafanart': getExtrafanart(htmlcode),
            'imagecut': 1,
            # tags: airav
            'tag': getTag(htmlcode),
            # label: javbus
            'label': getSerise(javbus_json),
            'actor_photo': getActorPhoto(javbus_json),
            'website': 'https://www.airav.wiki/video/' + number,
            'source': 'airav.py',
            # series: javbus
            'series': getSerise(javbus_json)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js


if __name__ == '__main__':
    config.getInstance().set_override("actor_photo:download_for_kodi=1")
    config.getInstance().set_override("debug_mode:switch=1")
    print(main('ADV-R0624'))  # javbus page returns 404, airav has data
    print(main('ADN-188'))    # single actress
    print(main('CJOD-278'))   # multiple actresses; javbus names are kana, airav names are kanji
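# Illustrative use of the paginated search helper above (a minimal sketch; the keyword is
# just an example, and the 'barcode'/'name' fields follow the sample response shown in search()):
#
#   for item in search('ADN-188'):
#       print(item['barcode'], item['name'])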
# ===== avsox.py =====
import sys
sys.path.append('..')
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)


def getActorPhoto(html):
    a = html.xpath('//a[@class="avatar-box"]')
    d = {}
    for i in a:
        l = i.find('.//img').attrib['src']
        t = i.find('span').text
        p2 = {t: l}
        d.update(p2)
    return d


def getActor(html):
    a = html.xpath('//a[@class="avatar-box"]')
    d = []
    for i in a:
        d.append(i.find('span').text)
    return d


def getCover_small(html):
    result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
    return result


def getTag(html):
    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
    return [i.strip() for i in x[2:]] if len(x) > 2 else []


def main(number):
    html = get_html('https://tellme.pw/avsox')
    site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
    a = get_html(site + '/cn/search/' + number)
    html = Crawler(a)
    result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
    if result1 == '' or result1 == 'null' or result1 == 'None':
        a = get_html(site + '/cn/search/' + number.replace('-', '_'))
        html = Crawler(a)
        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
    if result1 == '' or result1 == 'null' or result1 == 'None':
        a = get_html(site + '/cn/search/' + number.replace('_', ''))
        html = Crawler(a)
        result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
    detail = get_html("https:" + result1)
    lx = etree.fromstring(detail, etree.HTMLParser())
    avsox_crawler2 = Crawler(a)
    avsox_crawler = Crawler(detail)
    try:
        new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
        if new_number.upper() != number.upper():
            raise ValueError('number not found')
        title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/', '').strip(new_number)
        dic = {
            'actor': getActor(lx),
            'title': title,
            'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '", ' '),
            'outline': getStoryline(number, title),
            'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟', ''),
            'director': '',
            'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
            'number': new_number,
            'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
            # 'cover_small': getCover_small(html),
            'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
            'imagecut': 3,
            'tag': getTag(lx),
            'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
            'year': re.findall(r'\d{4}', avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
            'actor_photo': getActorPhoto(lx),
            'website': "https:" + result1,
            'source': 'avsox.py',
            'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
        }
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == "__main__":
    print(main('012717_472'))
    print(main('1'))  # got a fake result, raises 'number not found'
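# main() above retries the avsox search with the number rewritten (the id is illustrative):
# '012717_472' is searched as given, then with '-' replaced by '_', and finally with '_'
# stripped entirely ('012717472').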
# ===== carib.py =====
import sys
sys.path.append('../')
from lxml import html
from ADC_function import *
from WebCrawler.storyline import getStoryline


G_SITE = 'https://www.caribbeancom.com'


def main(number: str) -> json:
    try:
        url = f'{G_SITE}/moviepages/{number}/index.html'
        result, session = get_html_session(url, return_type='session')
        htmlcode = result.content.decode('euc-jp')
        if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
            raise ValueError("page not found")

        lx = html.fromstring(htmlcode)
        title = get_title(lx)

        dic = {
            'title': title,
            'studio': '加勒比',
            'year': get_year(lx),
            'outline': get_outline(lx, number, title),
            'runtime': get_runtime(lx),
            'director': '',
            'actor': get_actor(lx),
            'release': get_release(lx),
            'number': number,
            'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
            'tag': get_tag(lx),
            'extrafanart': get_extrafanart(lx),
            'label': get_series(lx),
            'imagecut': 1,
            'website': f'{G_SITE}/moviepages/{number}/index.html',
            'source': 'carib.py',
            'series': get_series(lx),
            '无码': True
        }
        if config.getInstance().download_actor_photo_for_kodi():
            dic['actor_photo'] = get_actor_photo(lx, session)
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
        return js

    except Exception as e:
        if config.getInstance().debug():
            print(e)
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_title(lx: html.HtmlElement) -> str:
    return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()


def get_year(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]


def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
    o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
    g = getStoryline(number, title, 无码=True)
    if len(g):
        return g
    return o


def get_release(lx: html.HtmlElement) -> str:
    return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/', '-')


def get_actor(lx: html.HtmlElement):
    r = []
    actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
    for act in actors:
        if str(act) != '他':
            r.append(act)
    return r


def get_tag(lx: html.HtmlElement) -> str:
    genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
    return genres


def get_extrafanart(lx: html.HtmlElement) -> str:
    r = []
    genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
    for g in genres:
        jpg = str(g)
        if '/member/' in jpg:
            break
        else:
            r.append('https://www.caribbeancom.com' + jpg)
    return r


def get_series(lx: html.HtmlElement) -> str:
    try:
        return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
    except:
        return ''


def get_runtime(lx: html.HtmlElement) -> str:
    return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()


def get_actor_photo(lx, session):
    htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
    names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
    t = {}
    for name, a in zip(names, htmla):
        if name.strip() == '他':
            continue
        p = {name.strip(): a.attrib['href']}
        t.update(p)
    o = {}
    for k, v in t.items():
        if '/search_act/' not in v:
            continue
        r = session.get(urljoin(G_SITE, v))
        if not r.ok:
            continue
        page = r.text
        pos = page.find('.full-bg')
        if pos < 0:
            continue
        css = page[pos:pos + 100]
        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
        if not cssBGjpgs or not len(cssBGjpgs[0]):
            continue
        p = {k: urljoin(r.url, cssBGjpgs[0])}
        o.update(p)
    return o


if __name__ == "__main__":
    print(main("070116-197"))  # actor has a photo
    print(main("041721-001"))
    print(main("080520-001"))
# ===== crawler.py =====
from lxml import etree


class Crawler:
    def __init__(self, htmlcode):
        self.html = etree.HTML(htmlcode)

    def getString(self, _xpath):
        if _xpath == "":
            return ""
        result = self.html.xpath(_xpath)
        try:
            return result[0]
        except:
            return ""

    def getStrings(self, _xpath):
        result = self.html.xpath(_xpath)
        try:
            return result
        except:
            return ""

    def getOutline(self, _xpath):
        result = self.html.xpath(_xpath)
        try:
            return "\n".join(result)
        except:
            return ""
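# A minimal usage sketch of the helper class above (the HTML snippet is made up for illustration):
if __name__ == '__main__':
    c = Crawler('<html><body><h1>Example</h1><p>first</p><p>second</p></body></html>')
    print(c.getString('//h1/text()'))   # 'Example' - first match only
    print(c.getStrings('//p/text()'))   # ['first', 'second'] - all matches
    print(c.getOutline('//p/text()'))   # 'first' and 'second' joined with a newline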
# ===== dlsite.py =====
import re
from lxml import etree
import json
import sys
sys.path.append('../')
from ADC_function import *


def getTitle(html):
    result = str(html.xpath('/html/head/title/text()')[0])
    result = result[:result.rfind(' | DLsite')]
    result = result[:result.rfind(' [')]
    result = result.replace('【HD版】', '')
    return result


def getActor(html):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    try:
        result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
    except:
        result1 = ''
    return result1


def getActorPhoto(actor):  # //*[@id="star_qdt"]/li/a/img
    a = actor.split(',')
    d = {}
    for i in a:
        p = {i: ''}
        d.update(p)
    return d


def getStudio(html):
    try:
        try:
            result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
        except:
            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
    except:
        result = ''
    return result


def getRuntime(a):
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    return str(result1 + result2).strip('+').rstrip('mi')


def getLabel(html):
    try:
        try:
            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
        except:
            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
    except:
        result = ''
    return result


def getYear(release):
    try:
        result = str(re.search(r'\d{4}', release).group())
        return result
    except:
        return release


def getRelease(html):
    result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
    return result1.replace('年', '-').replace('月', '-').replace('日', '')


def getTag(html):
    try:
        result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
        return result
    except:
        return ''


def getCover_small(a, index=0):
    # same issue as mentioned below:
    # javdb sometimes returns multiple results,
    # so DO NOT just take the first one - take the one with the correct index number
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result
    except:  # 2020.7.17 repair cover URL crawl
        result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
        if 'https' not in result:
            result = 'https:' + result
        return result


def getCover(html):
    result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
    return result.replace('.webp', '.jpg')


def getDirector(html):
    try:
        result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
    except:
        result = ''
    return result


def getOutline(html):
    total = []
    result = html.xpath('//*[@class="work_parts_area"]/p/text()')
    for i in result:
        total.append(i.strip('\r\n'))
    return str(total).strip(" ['']").replace("', '', '", r'\n').replace("', '", r'\n').strip(", '', '")


def getSeries(html):
    try:
        try:
            result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
        except:
            result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
    except:
        result = ''
    return result


def getExtrafanart(html):
    try:
        result = []
        for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'):
            result.append("https:" + i)
    except:
        result = ''
    return result


def main(number):
    try:
        if "RJ" in number or "VJ" in number:
            number = number.upper()
            htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'})
            html = etree.fromstring(htmlcode, etree.HTMLParser())
        else:
            htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})
            html = etree.HTML(htmlcode)
            search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
            if len(search_result) == 0:
                number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T", "")
                html = etree.HTML(get_html(
                    f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
                search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
            if len(search_result) == 0:
                if "~" in number:
                    number = number.replace("~", "〜")
                elif "〜" in number:
                    number = number.replace("〜", "~")
                html = etree.HTML(get_html(
                    f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
                search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
            if len(search_result) == 0:
                number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
                html = etree.HTML(get_html(
                    f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
                search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
            a = search_result[0]
            html = etree.HTML(get_html(a, cookies={'locale': 'zh-cn'}))
            number = str(re.findall(r"\wJ\w+", a)).strip(" [']")
        dic = {
            'actor': getStudio(html),
            'title': getTitle(html),
            'studio': getStudio(html),
            'outline': getOutline(html),
            'runtime': '',
            'director': getDirector(html),
            'release': getRelease(html),
            'number': number,
            'cover': 'https:' + getCover(html),
            'cover_small': '',
            'imagecut': 4,
            'tag': getTag(html),
            'label': getLabel(html),
            'year': getYear(getRelease(html)),  # str(re.search('\d{4}',getRelease(a)).group()),
            'actor_photo': '',
            'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
            'source': 'dlsite.py',
            'series': getSeries(html),
            'extrafanart': getExtrafanart(html),
            'allow_number_change': True,
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js


# main('DV-1562')
# input("[+][+]Press enter key to exit; you can check the error message before you exit.")
if __name__ == "__main__":
    config.getInstance().set_override("debug_mode:switch=1")
    print(main('牝教師4~穢された教壇~ 「生意気ドジっ娘女教師・美結~高飛車ハメ堕ち2濁金」'))
    print(main('RJ329607'))
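# Keyword fallbacks tried by main() above when a DLsite title search returns nothing
# (the steps mirror the code; title fragments are illustrative):
#   1. drop 'THE ANIMATION' / 'he Animation' and any remaining 't'/'T'
#   2. swap the tilde style: '~' <-> '〜'
#   3. drop volume markers: '上巻', '下巻', '前編', '後編'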
# ===== fanza.py =====
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
from urllib.parse import urlencode

from ADC_function import *
from WebCrawler.crawler import *


class fanzaCrawler(Crawler):
    def getFanzaString(self, string):
        result1 = str(self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")).strip(" ['']")
        result2 = str(self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")).strip(" ['']")
        return result1 + result2

    def getFanzaStrings(self, string):
        result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
        if len(result1) > 0:
            return result1
        result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
        return result2


def getRelease(fanza_Crawler):
    result = fanza_Crawler.getFanzaString('発売日:')
    if result == '' or result == '----':
        result = fanza_Crawler.getFanzaString('配信開始日:')
    return result.replace("/", "-").strip('\\n')


def getCover(html, number):
    cover_number = number
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except:
        # sometimes fanza changes "_" to "\u005f" in the image id
        if "_" in cover_number:
            cover_number = cover_number.replace("_", r"\u005f")
        try:
            result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
        except:
            # (TODO) handle more edge cases
            # print(html)
            # raise an exception here, same behavior as before;
            # the main requirement is fetching the picture
            raise ValueError("can not find image")
    return result


def getOutline(html):
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
    except:
        # (TODO) handle more edge cases
        # print(html)
        return ""
    return result


def getExtrafanart(htmlcode):  # get the extra fanart (stills)
    html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
    html = html_pather.search(htmlcode)
    if html:
        html = html.group()
        extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
        extrafanart_imgs = extrafanart_pather.findall(html)
        if extrafanart_imgs:
            s = []
            for img_url in extrafanart_imgs:
                img_urls = img_url.rsplit('-', 1)
                img_url = img_urls[0] + 'jp-' + img_urls[1]
                s.append(img_url)
            return s
    return ''


def main(number):
    # fanza allows letter + number + underscore; normalize the input here
    # @note: the only known use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
        "https://www.dmm.co.jp/rental/-/detail/=/cid=",
    ]
    chosen_url = ""
    fanza_Crawler = ''

    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(
            "https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
                urlencode({"rurl": chosen_url})
            )
        )
        fanza_Crawler = fanzaCrawler(htmlcode)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "", })
    try:
        # for some old pages the input number does not match the page:
        # for example, the url will be cid=test012
        # but the hinban on the page is test00012,
        # so get the hinban first and then pass it to the following functions
        fanza_hinban = fanza_Crawler.getFanzaString('品番:')
        out_num = fanza_hinban
        number_lo = number.lower()
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        if (re.sub('-|_', '', number_lo) == fanza_hinban or
            number_lo.replace('-', '00') == fanza_hinban or
            number_lo.replace('-', '') + 'so' == fanza_hinban
        ):
            out_num = number

        director = fanza_Crawler.getFanzaString('監督:')
        if "anime" in chosen_url:
            director = ""
        actor = fanza_Crawler.getStrings("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()")
        if "anime" in chosen_url:
            actor = ""
        # ----
        series = fanza_Crawler.getFanzaString('シリーズ:')
        if series == "----":
            series = ""
        label = fanza_Crawler.getFanzaString('レーベル')
        if label == "----":
            label = ""

        data = {
            "title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
            "studio": fanza_Crawler.getFanzaString('メーカー'),
            "outline": getOutline(html),
            "runtime": str(re.search(r'\d+', fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
            "director": director,
            "actor": actor,
            "release": getRelease(fanza_Crawler),
            "number": out_num,
            "cover": getCover(html, fanza_hinban),
            "imagecut": 1,
            "tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
            "extrafanart": getExtrafanart(htmlcode),
            "label": label,
            "year": re.findall(r'\d{4}', getRelease(fanza_Crawler))[0],  # str(re.search('\d{4}',getRelease(a)).group()),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
            "series": series,
        }
    except Exception as e:
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js


def main_htmlcode(number):
    # fanza allows letter + number + underscore; normalize the input here
    # @note: the only known use of underscore is h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break
    if "404 Not Found" in htmlcode:
        return json.dumps({"title": "", })
    return htmlcode


if __name__ == "__main__":
    # print(main("DV-1562"))
    # print(main("96fad1217"))
    print(main("AES-002"))
    print(main("MIAA-391"))
    print(main("OBA-326"))
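# A self-contained sketch of the cid/hinban matching rule used in main() above
# (purely illustrative; it mirrors the three accepted spellings of the same number):
def hinban_matches(number: str, hinban: str) -> bool:
    n = number.lower()
    return (re.sub('-|_', '', n) == hinban          # 'ABC-123'  vs 'abc123'
            or n.replace('-', '00') == hinban       # 'ABC-123'  vs 'abc00123'
            or n.replace('-', '') + 'so' == hinban) # 'ABC-123'  vs 'abc123so'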
# ===== FC2 crawler (adult.contents.fc2.com) =====
import sys
sys.path.append('../')
import re
import json
import config
import ADC_function
from WebCrawler.crawler import *


def getExtrafanart(htmlcode):  # get the extra fanart (stills)
    html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
    html = html_pather.search(htmlcode)
    if html:
        html = html.group()
        extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
        extrafanart_imgs = extrafanart_pather.findall(html)
        if extrafanart_imgs:
            return extrafanart_imgs
    return ''


def getTrailer(htmlcode, number):
    video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
    video = video_pather.findall(htmlcode)
    if video:
        try:
            video_url = video[0].replace('\'', '')
            video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + number + '/sample?key=' + video_url
            url_json = json.loads(ADC_function.get_html(video_url))['path'].replace('\\', '')
            return url_json
        except:
            return ''
    else:
        return ''


def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
        htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
        fc2_crawler = Crawler(htmlcode2)
        actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
        if actor == "":
            actor = '素人'
        lx = etree.fromstring(htmlcode2, etree.HTMLParser())
        cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
        cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
        release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
            strip(" ['販売日 : ']").replace('/', '-')
        dic = {
            'title': fc2_crawler.getString('/html/head/title/text()'),
            'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
            'year': re.findall(r'\d{4}', release)[0],
            'outline': '',  # getOutline_fc2com(htmlcode2),
            'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
            'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
            'actor': actor,
            'release': release,
            'number': 'FC2-' + number,
            'label': '',
            'cover': cover,
            'thumb': cover,
            'extrafanart': getExtrafanart(htmlcode2),
            "trailer": getTrailer(htmlcode2, number),
            'imagecut': 0,
            'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
            'actor_photo': '',
            'website': 'https://adult.contents.fc2.com/article/' + number + '/',
            'source': 'https://adult.contents.fc2.com/article/' + number + '/',
            'series': '',
        }
    except Exception as e:
        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    config.getInstance().set_override("debug_mode:switch=1")
    # print(main('FC2-2182382'))
    # print(main('FC2-607854'))
    print(main('FC2-2787433'))
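# The trailer lookup above pulls the 32-character sample key embedded in the article page and
# requests the URL it builds, e.g. for FC2-2787433 (the key value shown is hypothetical):
#   https://adult.contents.fc2.com/api/v2/videos/2787433/sample?key=<32-character key>
# The 'path' field of the JSON response is the trailer URL.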
# ===== FC2 crawler (fc2club.net) =====
import sys
sys.path.append('../')
from lxml import etree  # requires lxml
import json
import ADC_function


def getTitle_fc2com(htmlcode):  # get the title
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
    return result


def getActor_fc2com(htmlcode):
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[5]/a/text()')).strip(" ['']")
        return result
    except:
        return ''


def getStudio_fc2com(htmlcode):  # get the studio
    try:
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
        return result
    except:
        return ''


def getNum_fc2com(htmlcode):  # get the movie number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    title = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
    num = title.split(' ')[0]
    if not num.startswith('FC2'):
        num = ''
    return num


def getRelease_fc2com(htmlcode2):
    return ''


def getCover_fc2com(htmlcode2):  # get the cover image
    html = etree.fromstring(htmlcode2, etree.HTMLParser())
    imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
    imgUrl = imgUrl.replace('../', 'https://fc2club.net/')
    return imgUrl


def getTag_fc2com(htmlcode):  # get the tags
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    a = html.xpath('//*[@class="show-top-grids"]/div[1]/h5[4]/a')
    tag = []
    for i in range(len(a)):
        tag.append(str(a[i].xpath('text()')).strip("['']"))
    return tag


def getYear_fc2com(release):
    return ''


def getExtrafanart(htmlcode):  # get the extra fanart (stills)
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
    imgUrl = imgUrl.replace('../', 'https://fc2club.net/')
    return imgUrl


def getTrailer(htmlcode):
    return ''


def main(number):
    try:
        number = number.replace('FC2-', '').replace('fc2-', '')
        webUrl = 'https://fc2club.net/html/FC2-' + number + '.html'
        htmlcode2 = ADC_function.get_html(webUrl)
        actor = getActor_fc2com(htmlcode2)
        if getActor_fc2com(htmlcode2) == '':
            actor = 'FC2系列'
        dic = {
            'title': getTitle_fc2com(htmlcode2),
            'studio': getStudio_fc2com(htmlcode2),
            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
            'outline': '',  # getOutline_fc2com(htmlcode2),
            'runtime': '',
            'director': getStudio_fc2com(htmlcode2),
            'actor': actor,
            'release': getRelease_fc2com(htmlcode2),
            'number': 'FC2-' + number,
            'label': '',
            'cover': getCover_fc2com(htmlcode2),
            'extrafanart': getExtrafanart(htmlcode2),
            "trailer": getTrailer(htmlcode2),
            'imagecut': 0,
            'tag': getTag_fc2com(htmlcode2),
            'actor_photo': '',
            'website': 'https://fc2club.net/html/FC2-' + number + '.html/',
            'source': 'https://fc2club.net/html/FC2-' + number + '.html/',
            'series': '',
        }
    except Exception as e:
        if ADC_function.config.getInstance().debug():
            print(e)
        dic = {"title": ""}
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
    return js


if __name__ == '__main__':
    print(main('FC2-402422'))
# ===== gcolle.py =====
import sys
sys.path.append('../')

from WebCrawler.crawler import *
from ADC_function import *


def main(number):
    save_cookies = False
    cookie_filename = 'gcolle.json'
    try:
        gcolle_cookies, cookies_filepath = load_cookies(cookie_filename)
        session = get_html_session(cookies=gcolle_cookies)
        number = number.upper().replace('GCOLLE-', '')

        htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
        gcolle_crawler = Crawler(htmlcode)
        r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
        if r18_continue and r18_continue.startswith('http'):
            htmlcode = session.get(r18_continue).text
            gcolle_crawler = Crawler(htmlcode)
            save_cookies = True
            if cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file():
                Path(cookies_filepath).unlink(missing_ok=True)

        number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
        if number != number_html:
            raise Exception('[-]gcolle.py: number does not match')

        if save_cookies:
            cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
            cookies_save.parent.mkdir(parents=True, exist_ok=True)
            cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')

        # get the extrafanart urls
        if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
        else:
            extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
        # prepend "https:" to each extrafanart url
        for i in range(len(extrafanart)):
            extrafanart[i] = 'https:' + extrafanart[i]

        dic = {
            "title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
            "studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
            "year": re.findall(r'\d{4}', gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
            "outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
            "runtime": '',
            "director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
            "actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
            "release": re.findall(r'\d{4}-\d{2}-\d{2}', gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
            "number": "GCOLLE-" + str(number_html),
            "cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
            "thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
            "trailer": '',
            "actor_photo": '',
            "imagecut": 4,  # a value of 4 also marks a censored film; face detection is used to crop the cover
            "tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
            "extrafanart": extrafanart,
            "label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
            "website": 'https://gcolle.net/product_info.php/products_id/' + number,
            "source": 'gcolle.py',
            "series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
            '无码': False,
        }
        # for k, v in dic.items():
        #     if k == 'outline':
        #         print(k, len(v))
        #     else:
        #         print(k, v)
        # print('===============================================================')
    except Exception as e:
        dic = {'title': ''}
        if config.getInstance().debug():
            print(e)

    return dic


if __name__ == '__main__':
    from pprint import pprint
    config.getInstance().set_override("debug_mode:switch=1")
    pprint(main('840724'))
    pprint(main('840386'))
    pprint(main('838671'))
    pprint(main('814179'))
    pprint(main('834255'))
    pprint(main('814179'))
# ===== getchu.py =====
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.crawler import *
import re
import time
from urllib.parse import quote

JSON_HEADERS = {"Referer": "https://dl.getchu.com/"}
COOKIES_DL = {"adult_check_flag": "1"}
COOKIES_WWW = {'getchu_adalt_flag': 'getchu.com'}

GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
GETCHU_WWW_URL = 'http://www.getchu.com/soft.phtml?id=_WORD_'
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'


def get_dl_getchu(number):
    if "item" in number or 'GETCHU' in number.upper():
        number = re.findall(r'\d+', number)[0]
    else:
        htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number),
                            json_headers=JSON_HEADERS, cookies=COOKIES_DL)
        getchu = Crawler(htmlcode)
        url = getchu.getString(
            '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
        if url == "":
            return None
        number = re.findall(r'\d+', url)[0]
    htmlcode = get_html(GETCHU_DL_URL.replace("_WORD_", number), json_headers=JSON_HEADERS, cookies=COOKIES_DL)
    getchu = Crawler(htmlcode)
    dic = {
        "title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"),
        "cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"),
        "director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(),
        "studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
        "actor": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
        "label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
        "runtime": str(re.findall(r'\d+', str(getchu.getString(
            "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))).strip(" ['']"),
        "release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-"),
        "tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"),
        "outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"),
        "extrafanart": getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href"),
        "series": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
        "number": 'GETCHU-' + re.findall(r'\d+', number)[0],
        "imagecut": 4,
        "year": str(re.findall(r'\d{4}', str(getchu.getString(
            "//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")))).strip(" ['']"),
        "actor_photo": "",
        "website": "https://dl.getchu.com/i/" + number,
        "source": "getchu.py",
        "allow_number_change": True,
    }
    extrafanart = []
    for i in dic['extrafanart']:
        i = "https://dl.getchu.com" + i
        extrafanart.append(i)
    dic['extrafanart'] = extrafanart
    time.sleep(1)
    return dic


def get_www_getchu(number):
    number = quote(number, encoding="euc_jp")
    getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
    url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
    if url2 == '':
        getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
        url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
    if url2 == "":
        return None
    url2 = url2.replace('../', 'http://www.getchu.com/')
    getchu = Crawler(get_html(url2, cookies=COOKIES_WWW))
    dic = {
        "title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
        "cover": "http://www.getchu.com" + getchu.getString(
            "/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
        "director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
        "studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
        "actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
        "label": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
        "runtime": '',
        "release": getchu.getString("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-").strip(),
        "tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
        "outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
        "extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"),
        "series": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
        "number": 'GETCHU-' + re.findall(r'\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))[0],
        "imagecut": 0,
        "year": str(re.findall(r'\d{4}', str(getchu.getString(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"),
        "actor_photo": "",
        "website": url2,
        "headers": {'referer': url2},
        "source": "getchu.py",
        "allow_number_change": True,
    }
    extrafanart = []
    for i in dic['extrafanart']:
        i = "http://www.getchu.com" + i.replace("./", '/')
        if 'jpg' in i:
            extrafanart.append(i)
    dic['extrafanart'] = extrafanart
    time.sleep(1)
    return dic


def main(number):
    number = number.replace("-C", "")
    if "item" in number:
        lookups = [get_dl_getchu, get_www_getchu]
    else:
        lookups = [get_www_getchu, get_dl_getchu]
    dic = None
    for lookup in lookups:
        dic = lookup(number)
        if dic is not None:
            break
    if dic is None:
        return {"title": ""}
    outline = ''
    _list = dic['outline']
    for i in _list:
        outline = outline + i
    dic['outline'] = outline

    result = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
    return result


if __name__ == '__main__':
    test = []
    for i in test:
        print(i)
        print(main(i))
@@ -1,185 +0,0 @@
|
|||||||
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
import re
def main(number: str) -> str:
    try:
        result = post_html(url="https://www.jav321.com/search", query={"sn": number})
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))
    except:
        dic = {"title": ""}
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    if "/video/" in result.url:
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "trailer": get_trailer(result.text),
            "extrafanart": get_extrafanart(result.text),
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            **data,
        }
    else:
        dic = {"title": ""}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
|
|
||||||
|
|
||||||
def get_title(lx: html.HtmlElement) -> str:
|
|
||||||
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
|
|
||||||
|
|
||||||
|
|
||||||
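# Split the detail block (div.col-md-9) on <br/> and key each HTML fragment by its bold label,
# then pull the individual metadata fields out of that mapping.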
def parse_info(soup: BeautifulSoup) -> dict:
|
|
||||||
data = soup.select_one("div.row > div.col-md-9")
|
|
||||||
|
|
||||||
if data:
|
|
||||||
dd = str(data).split("<br/>")
|
|
||||||
data_dic = {}
|
|
||||||
for d in dd:
|
|
||||||
data_dic[get_bold_text(h=d)] = d
|
|
||||||
|
|
||||||
return {
|
|
||||||
"actor": get_actor(data_dic),
|
|
||||||
"label": get_label(data_dic),
|
|
||||||
"studio": get_studio(data_dic),
|
|
||||||
"tag": get_tag(data_dic),
|
|
||||||
"number": get_number(data_dic).upper(),
|
|
||||||
"release": get_release(data_dic),
|
|
||||||
"runtime": get_runtime(data_dic).replace(" minutes", ""),
|
|
||||||
"series": get_series(data_dic),
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {"title": ""}
|
|
||||||
|
|
||||||
|
|
||||||
def get_bold_text(h: str) -> str:
|
|
||||||
soup = BeautifulSoup(h, "html.parser")
|
|
||||||
if soup.b:
|
|
||||||
return soup.b.text
|
|
||||||
else:
|
|
||||||
return "UNKNOWN_TAG"
|
|
||||||
|
|
||||||
|
|
||||||
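# Collect the text of every <a href> tag inside the fragment and join the values with commas.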
def get_anchor_info(h: str) -> str:
|
|
||||||
result = []
|
|
||||||
|
|
||||||
data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
|
|
||||||
for d in data:
|
|
||||||
result.append(d.text)
|
|
||||||
|
|
||||||
return ",".join(result)
|
|
||||||
|
|
||||||
|
|
||||||
def get_text_info(h: str) -> str:
|
|
||||||
return h.split(": ")[1]
|
|
||||||
|
|
||||||
def get_trailer(htmlcode) -> str:  # parameter renamed from `html` to avoid shadowing the lxml html module
    videourl_pather = re.compile(r'<source src=\"(.*?)\"')
    videourl = videourl_pather.findall(htmlcode)
    if videourl:
        # rewrite the r18 CDN hosts to the dmm.co.jp one so the trailer URL stays reachable
        url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
        return url
    else:
        return ''
|
|
||||||
|
|
||||||
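# The sample stills sit between the col-md-3 grid and the juicyads <script> tag,
# so the outer regex isolates that region before extracting the <img src> URLs.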
def get_extrafanart(htmlcode): # 获取剧照
|
|
||||||
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
|
|
||||||
html = html_pather.search(htmlcode)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
return extrafanart_imgs
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def get_cover(lx: html.HtmlElement) -> str:
|
|
||||||
return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
|
|
||||||
|
|
||||||
|
|
||||||
def get_outline(lx: html.HtmlElement) -> str:
|
|
||||||
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
|
|
||||||
|
|
||||||
def get_series2(lx: html.HtmlElement) -> str:
|
|
||||||
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
|
|
||||||
|
|
||||||
|
|
||||||
def get_actor(data: dict) -> str:
|
|
||||||
if "出演者" in data:
|
|
||||||
return get_anchor_info(data["出演者"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_label(data: dict) -> str:
|
|
||||||
if "メーカー" in data:
|
|
||||||
return get_anchor_info(data["メーカー"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_tag(data: dict) -> str:
|
|
||||||
if "ジャンル" in data:
|
|
||||||
return get_anchor_info(data["ジャンル"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_studio(data: dict) -> str:
|
|
||||||
if "メーカー" in data:
|
|
||||||
return get_anchor_info(data["メーカー"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_number(data: dict) -> str:
|
|
||||||
if "品番" in data:
|
|
||||||
return get_text_info(data["品番"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_release(data: dict) -> str:
|
|
||||||
if "配信開始日" in data:
|
|
||||||
return get_text_info(data["配信開始日"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_runtime(data: dict) -> str:
|
|
||||||
if "収録時間" in data:
|
|
||||||
return get_text_info(data["収録時間"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_year(data: dict) -> str:
|
|
||||||
if "release" in data:
|
|
||||||
return data["release"][:4]
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_series(data: dict) -> str:
|
|
||||||
if "シリーズ" in data:
|
|
||||||
return get_anchor_info(data["シリーズ"])
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
print(main("jul-404"))
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from ADC_function import *
|
|
||||||
from WebCrawler.storyline import getStoryline
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
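# Build an {actor name: avatar URL} map, skipping entries that only carry the
# placeholder nowprinting.gif image.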
def getActorPhoto(html):
|
|
||||||
actors = html.xpath('//div[@class="star-name"]/../a/img')
|
|
||||||
d = {}
|
|
||||||
for i in actors:
|
|
||||||
p = i.attrib['src']
|
|
||||||
if "nowprinting.gif" in p:
|
|
||||||
continue
|
|
||||||
t = i.attrib['title']
|
|
||||||
d[t] = urljoin("https://www.javbus.com", p)
|
|
||||||
return d
|
|
||||||
def getTitle(html): #获取标题
|
|
||||||
title = str(html.xpath('/html/head/title/text()')[0])
|
|
||||||
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
|
|
||||||
return title
|
|
||||||
def getStudioJa(html):
|
|
||||||
x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getStudio(html): #获取厂商
|
|
||||||
x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getYear(html): #获取年份
|
|
||||||
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
|
|
||||||
return result[:4] if len(result)>=len('2000-01-01') else ''
|
|
||||||
def getCover(html): #获取封面链接
|
|
||||||
image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
|
|
||||||
return urljoin("https://www.javbus.com", image)
|
|
||||||
def getRelease(html): #获取出版日期
|
|
||||||
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
|
|
||||||
return result
|
|
||||||
def getRuntime(html): #获取分钟 已修改
|
|
||||||
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
|
|
||||||
return result
|
|
||||||
def getActor(html): #获取女优
|
|
||||||
b=[]
|
|
||||||
actors = html.xpath('//div[@class="star-name"]/a')
|
|
||||||
for i in actors:
|
|
||||||
b.append(i.attrib['title'])
|
|
||||||
return b
|
|
||||||
def getNum(html): #获取番号
|
|
||||||
kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
|
|
||||||
return kwdlist[0]
|
|
||||||
def getDirectorJa(html):
|
|
||||||
x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getDirector(html): #获取导演
|
|
||||||
x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getCID(html):
|
|
||||||
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
|
|
||||||
result = re.sub('/.*?.jpg','',string)
|
|
||||||
return result
|
|
||||||
def getOutline(number, title, uncensored):  # fetch the plot synopsis; storyline sites are queried concurrently
    if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
        return ''  # calls coming from airav.py skip the outline to avoid fetching the same data twice and slowing processing down
    return getStoryline(number, title, 无码=uncensored)
|
|
||||||
def getSeriseJa(html):
|
|
||||||
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getSerise(html): #获取系列
|
|
||||||
x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
|
|
||||||
return str(x[0]) if len(x) else ''
|
|
||||||
def getTag(html): # 获取标签
|
|
||||||
klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
|
|
||||||
return klist[1:]
|
|
||||||
def getExtrafanart(htmlcode): # 获取剧照
|
|
||||||
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
|
|
||||||
html = html_pather.search(htmlcode)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
|
|
||||||
return ''
|
|
||||||
def getUncensored(html):
|
|
||||||
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
|
|
||||||
return bool(x)
|
|
||||||
|
|
||||||
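# Uncensored titles are looked up on javbus.red, where the field labels are Japanese,
# hence the *Ja variants of the studio/director/series helpers.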
def main_uncensored(number):
|
|
||||||
w_number = number.replace('.', '-')
|
|
||||||
htmlcode = get_html('https://www.javbus.red/' + w_number)
|
|
||||||
if "<title>404 Page Not Found" in htmlcode:
|
|
||||||
return {"title": ""}
|
|
||||||
lx = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
title = getTitle(lx)
|
|
||||||
dic = {
|
|
||||||
'title': title,
|
|
||||||
'studio': getStudioJa(lx),
|
|
||||||
'year': getYear(lx),
|
|
||||||
'outline': getOutline(w_number, title, True),
|
|
||||||
'runtime': getRuntime(lx),
|
|
||||||
'director': getDirectorJa(lx),
|
|
||||||
'actor': getActor(lx),
|
|
||||||
'release': getRelease(lx),
|
|
||||||
'number': getNum(lx),
|
|
||||||
'cover': getCover(lx),
|
|
||||||
'tag': getTag(lx),
|
|
||||||
'extrafanart': getExtrafanart(htmlcode),
|
|
||||||
'label': getSeriseJa(lx),
|
|
||||||
'imagecut': 0,
|
|
||||||
'actor_photo': getActorPhoto(lx),
|
|
||||||
'website': 'https://www.javbus.red/' + w_number,
|
|
||||||
'source': 'javbus.py',
|
|
||||||
'series': getSeriseJa(lx),
|
|
||||||
'无码': True
|
|
||||||
}
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
|
|
||||||
|
|
||||||
def main(number):
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
url = "https://www." + secrets.choice([
|
|
||||||
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
|
|
||||||
'cdnbus.fun',
|
|
||||||
'dmmbus.fun', 'dmmsee.fun',
|
|
||||||
'fanbus.us',
|
|
||||||
'seedmm.fun',
|
|
||||||
]) + "/"
|
|
||||||
try:
|
|
||||||
htmlcode = get_html(url + number)
|
|
||||||
except:
|
|
||||||
htmlcode = get_html('https://www.javbus.com/' + number)
|
|
||||||
if "<title>404 Page Not Found" in htmlcode:
|
|
||||||
return {"title": ""}
|
|
||||||
lx = etree.fromstring(htmlcode,etree.HTMLParser())
|
|
||||||
title = getTitle(lx)
|
|
||||||
dic = {
|
|
||||||
'title': title,
|
|
||||||
'studio': getStudio(lx),
|
|
||||||
'year': getYear(lx),
|
|
||||||
'outline': getOutline(number, title, getUncensored(lx)),
|
|
||||||
'runtime': getRuntime(lx),
|
|
||||||
'director': getDirector(lx),
|
|
||||||
'actor': getActor(lx),
|
|
||||||
'release': getRelease(lx),
|
|
||||||
'number': getNum(lx),
|
|
||||||
'cover': getCover(lx),
|
|
||||||
'imagecut': 1,
|
|
||||||
'tag': getTag(lx),
|
|
||||||
'extrafanart': getExtrafanart(htmlcode),
|
|
||||||
'label': getSerise(lx),
|
|
||||||
'actor_photo': getActorPhoto(lx),
|
|
||||||
'website': 'https://www.javbus.com/' + number,
|
|
||||||
'source': 'javbus.py',
|
|
||||||
'series': getSerise(lx),
|
|
||||||
'无码': getUncensored(lx)
|
|
||||||
}
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
except:
|
|
||||||
return main_uncensored(number)
|
|
||||||
except Exception as e:
|
|
||||||
if config.getInstance().debug():
|
|
||||||
print(e)
|
|
||||||
data = {
|
|
||||||
"title": "",
|
|
||||||
}
|
|
||||||
js = json.dumps(
|
|
||||||
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
|
|
||||||
)
|
|
||||||
return js
|
|
||||||
|
|
||||||
if __name__ == "__main__" :
|
|
||||||
config.getInstance().set_override("storyline:switch=0")
|
|
||||||
config.getInstance().set_override("actor_photo:download_for_kodi=1")
|
|
||||||
config.getInstance().set_override("debug_mode:switch=1")
|
|
||||||
print(main('STAR-438'))
|
|
||||||
print(main('ABP-960'))
|
|
||||||
print(main('ADV-R0624')) # 404
|
|
||||||
print(main('MMNT-010'))
|
|
||||||
print(main('ipx-292'))
|
|
||||||
print(main('CEMD-011'))
|
|
||||||
print(main('CJOD-278'))
|
|
||||||
print(main('BrazzersExxtra.21.02.01'))
|
|
||||||
print(main('100221_001'))
|
|
||||||
print(main('AVSW-061'))
|
|
||||||
@@ -1,321 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from ADC_function import *
|
|
||||||
from WebCrawler.storyline import getStoryline
|
|
||||||
|
|
||||||
def getTitle(html):
|
|
||||||
browser_title = str(html.xpath("/html/head/title/text()")[0])
|
|
||||||
return browser_title[:browser_title.find(' | JavDB')].strip()
|
|
||||||
|
|
||||||
def getActor(html):
    actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
    genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
    r = []
    idx = 0
    actor_gender = config.getInstance().actor_gender()
    if actor_gender not in ['female', 'male', 'both', 'all']:
        actor_gender = 'female'
    for act in actors:
        if ((actor_gender == 'all')
                or (actor_gender == 'both' and genders[idx] in ['symbol female', 'symbol male'])
                or (actor_gender == 'female' and genders[idx] == 'symbol female')
                or (actor_gender == 'male' and genders[idx] == 'symbol male')):
            r.append(act)
        idx = idx + 1
    return r
|
|
||||||
|
|
||||||
def getaphoto(url, session):
|
|
||||||
html_page = session.get(url).text
|
|
||||||
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
|
|
||||||
return img_url[0] if img_url else ''
|
|
||||||
|
|
||||||
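# Try the c1.jdbstatic.com avatar CDN first; if the HEAD request fails,
# fall back to scraping the avatar from the actor's own page.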
def getActorPhoto(html, javdb_site, session):
|
|
||||||
actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
|
|
||||||
if not actorall:
|
|
||||||
return {}
|
|
||||||
a = getActor(html)
|
|
||||||
actor_photo = {}
|
|
||||||
if not session:
|
|
||||||
session = get_html_session()
|
|
||||||
for i in actorall:
|
|
||||||
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
|
|
||||||
if not len(x) or not len(x[0]) or i.text not in a:
|
|
||||||
continue
|
|
||||||
actor_id = x[0]
|
|
||||||
pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
|
|
||||||
if not session.head(pic_url).ok:
|
|
||||||
pic_url = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
|
|
||||||
if len(pic_url):
|
|
||||||
actor_photo[i.text] = pic_url
|
|
||||||
return actor_photo
|
|
||||||
|
|
||||||
def getStudio(a, html):
|
|
||||||
patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
|
|
||||||
pianshang = patherr.findall(a)
|
|
||||||
if pianshang:
|
|
||||||
result = pianshang[0].strip()
|
|
||||||
if len(result):
|
|
||||||
return result
|
|
||||||
# fall back to the seller as the studio
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
|
|
||||||
except:
|
|
||||||
result = ''
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getRuntime(html):
|
|
||||||
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
|
|
||||||
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
|
|
||||||
return str(result1 + result2).strip('+').rstrip('mi')
|
|
||||||
def getLabel(html):
|
|
||||||
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
|
|
||||||
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
|
|
||||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
|
||||||
def getNum(html):
|
|
||||||
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
|
|
||||||
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
|
|
||||||
return str(result2 + result1).strip('+')
|
|
||||||
def getYear(getRelease):
|
|
||||||
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)\-.*?</span>')
|
|
||||||
dates = patherr.findall(getRelease)
|
|
||||||
if dates:
|
|
||||||
result = dates[0]
|
|
||||||
else:
|
|
||||||
result = ''
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getRelease(a):
|
|
||||||
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)</span>')
|
|
||||||
dates = patherr.findall(a)
|
|
||||||
if dates:
|
|
||||||
result = dates[0]
|
|
||||||
else:
|
|
||||||
result = ''
|
|
||||||
return result
|
|
||||||
def getTag(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
|
|
||||||
return result
|
|
||||||
|
|
||||||
except:
|
|
||||||
result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getCover_small(html, index=0):
|
|
||||||
# same issue mentioned below:
# javdb sometimes returns multiple results,
# so DO NOT just take the first one — use the entry at the requested index
|
|
||||||
try:
|
|
||||||
result = html.xpath("//*[contains(@class,'movie-list')]/div/a/div[contains(@class, 'cover')]/img/@src")[index]
|
|
||||||
if not 'https' in result:
|
|
||||||
result = 'https:' + result
|
|
||||||
return result
|
|
||||||
except: # 2020.7.17 Repair Cover Url crawl
|
|
||||||
try:
|
|
||||||
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
|
|
||||||
if not 'https' in result:
|
|
||||||
result = 'https:' + result
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
|
|
||||||
if not 'https' in result:
|
|
||||||
result = 'https:' + result
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def getTrailer(htmlcode): # 获取预告片
|
|
||||||
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
|
|
||||||
video = video_pather.findall(htmlcode)
|
|
||||||
# guard against an empty match list
|
|
||||||
if video and video[0] != "":
|
|
||||||
if not 'https:' in video[0]:
|
|
||||||
video_url = 'https:' + video[0]
|
|
||||||
else:
|
|
||||||
video_url = video[0]
|
|
||||||
else:
|
|
||||||
video_url = ''
|
|
||||||
return video_url
|
|
||||||
|
|
||||||
def getExtrafanart(html): # 获取剧照
|
|
||||||
result = []
|
|
||||||
try:
|
|
||||||
result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
return result
|
|
||||||
def getCover(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
|
|
||||||
except: # 2020.7.17 Repair Cover Url crawl
|
|
||||||
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
|
|
||||||
return result
|
|
||||||
def getDirector(html):
|
|
||||||
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
|
|
||||||
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
|
|
||||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
|
||||||
def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询
|
|
||||||
return getStoryline(number, title, 无码=uncensored)
|
|
||||||
def getSeries(html):
|
|
||||||
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
|
|
||||||
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
|
|
||||||
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
|
||||||
def getUserRating(html):
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
|
|
||||||
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
|
||||||
return float(v[0][0]), int(v[0][1])
|
|
||||||
except:
|
|
||||||
return
|
|
||||||
def getUncensored(html):
|
|
||||||
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
|
|
||||||
' or contains(@href,"/tags/western?")]')
|
|
||||||
return bool(x)
|
|
||||||
|
|
||||||
def main(number):
|
|
||||||
# After javdb's update only one of the numbered mirror sites can be logged in at a time (a new login kicks out the old one),
# so the site is chosen from the first javdb*.json cookie file found; if no .json file exists or it has expired, a site is picked at random.
|
|
||||||
javdb_sites = config.getInstance().javdb_sites().split(',')
|
|
||||||
debug = config.getInstance().debug()
|
|
||||||
for i in javdb_sites:
|
|
||||||
javdb_sites[javdb_sites.index(i)] = "javdb" + i
|
|
||||||
javdb_sites.append("javdb")
|
|
||||||
try:
|
|
||||||
# if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
|
|
||||||
# pass
|
|
||||||
# else:
|
|
||||||
# number = number.upper()
|
|
||||||
number = number.upper()
|
|
||||||
javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
|
|
||||||
# skip expired cookies: the javdb login page advertises 7 days without re-login, so cookies are assumed valid for 7 days
|
|
||||||
has_json = False
|
|
||||||
for cj in javdb_sites:
|
|
||||||
javdb_site = cj
|
|
||||||
cookie_json = javdb_site + '.json'
|
|
||||||
cookies_dict, cookies_filepath = load_cookies(cookie_json)
|
|
||||||
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
|
|
||||||
cdays = file_modification_days(cookies_filepath)
|
|
||||||
if cdays < 7:
|
|
||||||
javdb_cookies = cookies_dict
|
|
||||||
has_json = True
|
|
||||||
break
|
|
||||||
elif cdays != 9999:
|
|
||||||
print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
|
|
||||||
if not has_json:
|
|
||||||
javdb_site = secrets.choice(javdb_sites)
|
|
||||||
if debug:
|
|
||||||
print(f'[!]javdb:select site {javdb_site}')
|
|
||||||
session = None
|
|
||||||
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
|
|
||||||
try:
|
|
||||||
if debug:
|
|
||||||
raise # try get_html_by_scraper() branch
|
|
||||||
res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
|
|
||||||
if not res:
|
|
||||||
raise
|
|
||||||
query_result = res.text
|
|
||||||
except:
|
|
||||||
res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
|
|
||||||
if not res:
|
|
||||||
raise ValueError('page not found')
|
|
||||||
query_result = res.text
|
|
||||||
if session is None:
|
|
||||||
raise ValueError('page not found')
|
|
||||||
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
# javdb sometimes returns multiple results,
# and the first element may not be the one we are looking for,
# so iterate all candidates and find the matching one
|
|
||||||
urls = html.xpath('//*[contains(@class,"movie-list")]/div/a/@href')
|
|
||||||
# note: western releases use name-style ids, e.g. ['Blacked','Blacked']
|
|
||||||
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
|
|
||||||
correct_url = urls[0]
|
|
||||||
else:
|
|
||||||
ids = html.xpath('//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
|
|
||||||
try:
|
|
||||||
correct_url = urls[ids.index(number)]
|
|
||||||
except:
|
|
||||||
# to avoid picking up a wrong number, only accept an exact match
|
|
||||||
if ids[0].upper() != number:
|
|
||||||
raise ValueError("number not found")
|
|
||||||
correct_url = urls[0]
|
|
||||||
try:
|
|
||||||
# get faster benefit from http keep-alive
|
|
||||||
javdb_detail_url = urljoin(res.url, correct_url)
|
|
||||||
detail_page = session.get(javdb_detail_url).text
|
|
||||||
except:
|
|
||||||
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
|
|
||||||
session = None
|
|
||||||
|
|
||||||
# etree.fromstring is expensive, so call it only once; its xpath queries are fast (faster than bs4 find/select) and can be used freely
|
|
||||||
lx = etree.fromstring(detail_page, etree.HTMLParser())
|
|
||||||
imagecut = 1
|
|
||||||
dp_number = getNum(lx)
|
|
||||||
if dp_number.upper() != number.upper():
|
|
||||||
raise ValueError("number not eq"+dp_number)
|
|
||||||
title = getTitle(lx)
|
|
||||||
if title and dp_number:
|
|
||||||
number = dp_number
|
|
||||||
# remove duplicate title
|
|
||||||
title = title.replace(number, '').strip()
|
|
||||||
dic = {
|
|
||||||
'actor': getActor(lx),
|
|
||||||
'title': title,
|
|
||||||
'studio': getStudio(detail_page, lx),
|
|
||||||
'outline': getOutline(number, title, getUncensored(lx)),
|
|
||||||
'runtime': getRuntime(lx),
|
|
||||||
'director': getDirector(lx),
|
|
||||||
'release': getRelease(detail_page),
|
|
||||||
'number': number,
|
|
||||||
'cover': getCover(lx),
|
|
||||||
'trailer': getTrailer(detail_page),
|
|
||||||
'extrafanart': getExtrafanart(lx),
|
|
||||||
'imagecut': imagecut,
|
|
||||||
'tag': getTag(lx),
|
|
||||||
'label': getLabel(lx),
|
|
||||||
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
|
|
||||||
'website': urljoin('https://javdb.com', correct_url),
|
|
||||||
'source': 'javdb.py',
|
|
||||||
'series': getSeries(lx),
|
|
||||||
'无码': getUncensored(lx)
|
|
||||||
}
|
|
||||||
userrating = getUserRating(lx)
|
|
||||||
if isinstance(userrating, tuple) and len(userrating) == 2:
|
|
||||||
dic['用户评分'] = userrating[0]
|
|
||||||
dic['评分人数'] = userrating[1]
|
|
||||||
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
|
|
||||||
dic['actor'].append('素人')
|
|
||||||
if not dic['series']:
|
|
||||||
dic['series'] = dic['studio']
|
|
||||||
if not dic['label']:
|
|
||||||
dic['label'] = dic['studio']
|
|
||||||
if config.getInstance().download_actor_photo_for_kodi():
|
|
||||||
dic['actor_photo'] = getActorPhoto(lx, javdb_site, session)
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
if debug:
|
|
||||||
print(e)
|
|
||||||
dic = {"title": ""}
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
|
|
||||||
# main('DV-1562')
|
|
||||||
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
|
|
||||||
if __name__ == "__main__":
|
|
||||||
config.getInstance().set_override("storyline:switch=0")
|
|
||||||
config.getInstance().set_override("actor_photo:download_for_kodi=1")
|
|
||||||
config.getInstance().set_override("debug_mode:switch=1")
|
|
||||||
# print(main('blacked.20.05.30'))
|
|
||||||
print(main('AGAV-042'))
|
|
||||||
print(main('BANK-022'))
|
|
||||||
print(main('070116-197'))
|
|
||||||
print(main('093021_539')) # 没有剧照 片商pacopacomama
|
|
||||||
#print(main('FC2-2278260'))
|
|
||||||
# print(main('FC2-735670'))
|
|
||||||
# print(main('FC2-1174949')) # not found
|
|
||||||
print(main('MVSD-439'))
|
|
||||||
# print(main('EHM0001')) # not found
|
|
||||||
#print(main('FC2-2314275'))
|
|
||||||
print(main('EBOD-646'))
|
|
||||||
print(main('LOVE-262'))
|
|
||||||
print(main('ABP-890'))
|
|
||||||
print(main('blacked.14.12.08'))
|
|
||||||
@@ -1,161 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
import json
|
|
||||||
import bs4
|
|
||||||
import re
|
|
||||||
from WebCrawler import airav
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from lxml import html
|
|
||||||
from http.cookies import SimpleCookie
|
|
||||||
|
|
||||||
from ADC_function import get_javlib_cookie, get_html
|
|
||||||
|
|
||||||
|
|
||||||
def main(number: str):
|
|
||||||
raw_cookies, user_agent = get_javlib_cookie()
|
|
||||||
|
|
||||||
# Blank cookies mean javlib site return error
|
|
||||||
if not raw_cookies:
|
|
||||||
return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
|
|
||||||
|
|
||||||
# Manually construct a dictionary
|
|
||||||
s_cookie = SimpleCookie()
|
|
||||||
s_cookie.load(raw_cookies)
|
|
||||||
cookies = {}
|
|
||||||
for key, morsel in s_cookie.items():
|
|
||||||
cookies[key] = morsel.value
|
|
||||||
|
|
||||||
# Scraping
|
|
||||||
result = get_html(
|
|
||||||
"http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
|
|
||||||
cookies=cookies,
|
|
||||||
ua=user_agent,
|
|
||||||
return_type="object"
|
|
||||||
)
|
|
||||||
soup = BeautifulSoup(result.text, "html.parser")
|
|
||||||
lx = html.fromstring(str(soup))
|
|
||||||
|
|
||||||
fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>')
|
|
||||||
fanhao = fanhao_pather.findall(result.text)
|
|
||||||
|
|
||||||
if "/?v=jav" in result.url:
|
|
||||||
dic = {
|
|
||||||
"title": get_title(lx, soup),
|
|
||||||
"studio": get_table_el_single_anchor(soup, "video_maker"),
|
|
||||||
"year": get_table_el_td(soup, "video_date")[:4],
|
|
||||||
"outline": get_outline(number),
|
|
||||||
"director": get_table_el_single_anchor(soup, "video_director"),
|
|
||||||
"cover": get_cover(lx),
|
|
||||||
"imagecut": 1,
|
|
||||||
"actor_photo": "",
|
|
||||||
"website": result.url,
|
|
||||||
"source": "javlib.py",
|
|
||||||
"actor": get_table_el_multi_anchor(soup, "video_cast"),
|
|
||||||
"label": get_table_el_td(soup, "video_label"),
|
|
||||||
"tag": get_table_el_multi_anchor(soup, "video_genres"),
|
|
||||||
"number": get_table_el_td(soup, "video_id"),
|
|
||||||
"release": get_table_el_td(soup, "video_date"),
|
|
||||||
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
|
|
||||||
"series":'',
|
|
||||||
}
|
|
||||||
elif number.upper() in fanhao:
|
|
||||||
url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')
|
|
||||||
s = {}
|
|
||||||
url_list = url_pather.findall(result.text)
|
|
||||||
for url in url_list:
|
|
||||||
s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.')
|
|
||||||
av_url = s[number.upper()]
|
|
||||||
result = get_html(
|
|
||||||
av_url,
|
|
||||||
cookies=cookies,
|
|
||||||
ua=user_agent,
|
|
||||||
return_type="object"
|
|
||||||
)
|
|
||||||
soup = BeautifulSoup(result.text, "html.parser")
|
|
||||||
lx = html.fromstring(str(soup))
|
|
||||||
|
|
||||||
dic = {
|
|
||||||
"title": get_title(lx, soup),
|
|
||||||
"studio": get_table_el_single_anchor(soup, "video_maker"),
|
|
||||||
"year": get_table_el_td(soup, "video_date")[:4],
|
|
||||||
"outline": get_outline(number),
|
|
||||||
"director": get_table_el_single_anchor(soup, "video_director"),
|
|
||||||
"cover": get_cover(lx),
|
|
||||||
"imagecut": 1,
|
|
||||||
"actor_photo": "",
|
|
||||||
"website": result.url,
|
|
||||||
"source": "javlib.py",
|
|
||||||
"actor": get_table_el_multi_anchor(soup, "video_cast"),
|
|
||||||
"label": get_table_el_td(soup, "video_label"),
|
|
||||||
"tag": get_table_el_multi_anchor(soup, "video_genres"),
|
|
||||||
"number": get_table_el_td(soup, "video_id"),
|
|
||||||
"release": get_table_el_td(soup, "video_date"),
|
|
||||||
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
|
|
||||||
"series": '',
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
dic = {"title": ""}
|
|
||||||
|
|
||||||
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
|
|
||||||
|
|
||||||
|
|
||||||
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
|
|
||||||
return lx.xpath(xpath)[0].strip()
|
|
||||||
|
|
||||||
|
|
||||||
def get_outline(number):
|
|
||||||
try:
|
|
||||||
response = json.loads(airav.main(number))
|
|
||||||
result = response['outline']
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
|
|
||||||
tag = soup.find(id=tag_id).find("a")
|
|
||||||
|
|
||||||
if tag is not None:
|
|
||||||
return tag.string.strip()
|
|
||||||
else:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
|
|
||||||
tags = soup.find(id=tag_id).find_all("a")
|
|
||||||
|
|
||||||
return process(tags)
|
|
||||||
|
|
||||||
|
|
||||||
def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
|
|
||||||
tags = soup.find(id=tag_id).find_all("td", class_="text")
|
|
||||||
|
|
||||||
return process(tags)
|
|
||||||
|
|
||||||
|
|
||||||
def process(tags: bs4.element.ResultSet) -> str:
|
|
||||||
values = []
|
|
||||||
for tag in tags:
|
|
||||||
value = tag.string
|
|
||||||
if value is not None and value != "----":
|
|
||||||
values.append(value)
|
|
||||||
|
|
||||||
return ",".join(x for x in values if x)
|
|
||||||
|
|
||||||
|
|
||||||
def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
|
|
||||||
title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
|
|
||||||
number = get_table_el_td(soup, "video_id")
|
|
||||||
|
|
||||||
return title.replace(number, "").strip()
|
|
||||||
|
|
||||||
|
|
||||||
def get_cover(lx: html.HtmlComment) -> str:
|
|
||||||
return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
|
|
||||||
#lists = ["DVMC-003"]
|
|
||||||
for num in lists:
|
|
||||||
print(main(num))
|
|
||||||
@@ -1,173 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from ADC_function import *
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
from lib2to3.pgen2 import parse
|
|
||||||
|
|
||||||
from urllib.parse import urlparse, unquote
|
|
||||||
|
|
||||||
|
|
||||||
def getActorPhoto(html):
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getTitle(html): # 获取标题
|
|
||||||
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
|
|
||||||
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
|
|
||||||
# <title>MD0094/贫嘴贱舌中出大嫂/坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
|
|
||||||
# <title>TM0002-我的痴女女友-麻豆社</title>
|
|
||||||
browser_title = str(html.xpath("/html/head/title/text()")[0])
|
|
||||||
title = str(re.findall(r'^[A-Z0-9 //\-]*(.*)-麻豆社$', browser_title)[0]).strip()
|
|
||||||
return title
|
|
||||||
|
|
||||||
def getStudio(html): # 获取厂商 已修改
|
|
||||||
try:
|
|
||||||
category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
|
|
||||||
return category.strip()
|
|
||||||
except:
|
|
||||||
return '麻豆社'
|
|
||||||
|
|
||||||
|
|
||||||
def getYear(html): # 获取年份
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getCover(htmlcode): # 获取封面图片
|
|
||||||
try:
|
|
||||||
url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
|
|
||||||
return url.strip()
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getRelease(html): # 获取出版日期
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getRuntime(html): # 获取播放时长
|
|
||||||
return ''
|
|
||||||
|
|
||||||
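# The canonical page URL is taken from the WeChat share button's data-url attribute.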
def getUrl(html):
|
|
||||||
return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0])
|
|
||||||
|
|
||||||
|
|
||||||
def getNum(url, number):  # extract the ID number from the page URL
    try:
        # decode the URL and take the file-name part of the path
        filename = unquote(urlparse(url).path)
        # trim the leading slash and the ".html" suffix
        result = filename[1:-5].upper().strip()
        # drop any Chinese characters when the result differs from the requested number
        if result.upper() != number.upper():
            result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]
        # strip leftover separators
        return result.strip('-')
    except:
        return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getDirector(html): # 获取导演 已修改
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getOutline(html): # 获取概述
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getSerise(html): # 获取系列 已修改
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getTag(html, studio): # 获取标签
|
|
||||||
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
|
|
||||||
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
|
|
||||||
|
|
||||||
|
|
||||||
def getExtrafanart(html): # 获取剧照
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def cutTags(tags):  # split actor names out of the tag list (currently unused, see note in main)
    actors = []
    remaining = []
    for tag in tags:
        actors.append(tag)
    return actors, remaining
|
|
||||||
|
|
||||||
|
|
||||||
def main(number):
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
number = number.lower().strip()
|
|
||||||
url = "https://madou.club/" + number + ".html"
|
|
||||||
htmlcode = get_html(url)
|
|
||||||
except:
|
|
||||||
# print(number)
|
|
||||||
pass
|
|
||||||
|
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
url = getUrl(html)
|
|
||||||
studio = getStudio(html)
|
|
||||||
tags = getTag(html, studio)
|
|
||||||
#actor,tags = cutTags(tags)  # the actor's position inside the tags is not fixed, so extracting it was abandoned
|
|
||||||
actor = ''
|
|
||||||
dic = {
|
|
||||||
# 标题
|
|
||||||
'title': getTitle(html),
|
|
||||||
# 制作商
|
|
||||||
'studio': studio,
|
|
||||||
# 年份
|
|
||||||
'year': getYear(html),
|
|
||||||
# 简介
|
|
||||||
'outline': getOutline(html),
|
|
||||||
#
|
|
||||||
'runtime': getRuntime(html),
|
|
||||||
# 导演
|
|
||||||
'director': getDirector(html),
|
|
||||||
# 演员
|
|
||||||
'actor': actor,
|
|
||||||
# 发售日
|
|
||||||
'release': getRelease(html),
|
|
||||||
# 番号
|
|
||||||
'number': getNum(url, number),
|
|
||||||
# 封面链接
|
|
||||||
'cover': getCover(htmlcode),
|
|
||||||
# 剧照获取
|
|
||||||
'extrafanart': getExtrafanart(html),
|
|
||||||
'imagecut': 1,
|
|
||||||
#
|
|
||||||
'tag': tags,
|
|
||||||
#
|
|
||||||
'label': getSerise(html),
|
|
||||||
# 作者图片
|
|
||||||
'website': url,
|
|
||||||
'source': 'madou.py',
|
|
||||||
# 使用
|
|
||||||
'series': getSerise(html),
|
|
||||||
'无码': True
|
|
||||||
}
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
|
|
||||||
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
except Exception as e:
|
|
||||||
if config.getInstance().debug():
|
|
||||||
print(e)
|
|
||||||
data = {
|
|
||||||
"title": "",
|
|
||||||
}
|
|
||||||
js = json.dumps(
|
|
||||||
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
|
|
||||||
)
|
|
||||||
return js
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
config.getInstance().set_override("debug_mode:switch=1")
|
|
||||||
print(main('MD0129'))
|
|
||||||
# print(main('TM0002'))
|
|
||||||
# print(main('MD0222'))
|
|
||||||
# print(main('MD0140-2'))
|
|
||||||
# print(main('MAD039'))
|
|
||||||
# print(main('JDMY027'))
|
|
||||||
|
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from ADC_function import *
|
|
||||||
from WebCrawler.crawler import *
|
|
||||||
|
|
||||||
class MgsCrawler(Crawler):
|
|
||||||
def getMgsString(self, _xpath):
|
|
||||||
html = self.html
|
|
||||||
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
|
|
||||||
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
|
||||||
|
|
||||||
def getTag(a):
|
|
||||||
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
|
|
||||||
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
|
|
||||||
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
|
|
||||||
return result
|
|
||||||
|
|
||||||
def getExtrafanart(htmlcode2): # 获取剧照
|
|
||||||
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
|
|
||||||
html = html_pather.search(htmlcode2)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
return extrafanart_imgs
|
|
||||||
return ''
|
|
||||||
|
|
||||||
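# mgstage needs the adc=1 cookie to get past the age check; the detail fields are then
# parsed out of the detail_data table and the introduction block.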
def main(number2):
|
|
||||||
number=number2.upper()
|
|
||||||
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
|
|
||||||
soup = BeautifulSoup(htmlcode2, 'lxml')
|
|
||||||
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
|
||||||
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
|
|
||||||
htmlcode = MgsCrawler(htmlcode2)
|
|
||||||
a = MgsCrawler(a2)
|
|
||||||
b = MgsCrawler(b2)
|
|
||||||
#print(b)
|
|
||||||
dic = {
|
|
||||||
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
|
|
||||||
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
|
|
||||||
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
|
|
||||||
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
|
|
||||||
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
|
|
||||||
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
|
|
||||||
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
|
|
||||||
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
|
|
||||||
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
|
|
||||||
'imagecut': 1,
|
|
||||||
'tag': getTag(a2),
|
|
||||||
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
|
|
||||||
'extrafanart': getExtrafanart(htmlcode2),
|
|
||||||
'year': str(re.findall(r'\d{4}', a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
|
|
||||||
# str(re.search('\d{4}',getRelease(a)).group()),
|
|
||||||
'actor_photo': '',
|
|
||||||
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
|
|
||||||
'source': 'mgstage.py',
|
|
||||||
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
|
|
||||||
}
|
|
||||||
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
print(main('SIRO-4149'))
|
|
||||||
@@ -1,154 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from ADC_function import *
|
|
||||||
|
|
||||||
|
|
||||||
host = 'https://www.91mv.org'
|
|
||||||
|
|
||||||
def getActorPhoto(html):
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getTitle(html): #获取标题
|
|
||||||
try:
|
|
||||||
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
|
|
||||||
result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][0])
|
|
||||||
return result.strip()
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getStudio(html): #获取厂商 已修改
|
|
||||||
return '91制片厂'
|
|
||||||
|
|
||||||
def getYear(html): #获取年份
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//p[@class="date"]/text()')[0])
|
|
||||||
date = result.replace('日期:','')
|
|
||||||
if isinstance(date, str) and len(date):
|
|
||||||
return date
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getCover(htmlcode): #获取封面图片
|
|
||||||
try:
|
|
||||||
url = str(re.findall('var pic_url = "(.*?)"',htmlcode)[0])
|
|
||||||
return url.strip()
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getRelease(html): #获取出版日期
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//p[@class="date"]/text()')[0])
|
|
||||||
date = result.replace('日期:','')
|
|
||||||
if isinstance(date, str) and len(date):
|
|
||||||
return date
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getRuntime(htmlcode): #获取播放时长
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getActor(html): #获取女优
|
|
||||||
b=[]
|
|
||||||
for player in html.xpath('//p[@class="player-name"]/text()'):
|
|
||||||
player = player.replace('主演:','')
|
|
||||||
b.append(player)
|
|
||||||
return b
|
|
||||||
|
|
||||||
def getNum(html): #获取番号
|
|
||||||
try:
|
|
||||||
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
|
|
||||||
result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][1])
|
|
||||||
return result.strip()
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getDirector(html): #获取导演 已修改
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getOutline(html): #获取概述
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//div[@class="play-text"]/text()')[0])
|
|
||||||
return result.strip()
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getSerise(htmlcode): #获取系列 已修改
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getTag(html): # 获取标签
|
|
||||||
return html.xpath('//div[@class="player-tag"]/text()')
|
|
||||||
|
|
||||||
def getExtrafanart(htmlcode): # 获取剧照
|
|
||||||
return ''
|
|
||||||
|
|
||||||
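# Query the site search and return the relative URL of the first matching video.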
def search(keyword): #搜索,返回结果
|
|
||||||
search_html = get_html(host + '/index/search?keywords=' + keyword)
|
|
||||||
html = etree.fromstring(search_html, etree.HTMLParser())
|
|
||||||
return html.xpath('//a[@class="video-list"]/@href')[0]
|
|
||||||
|
|
||||||
def main(number):
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
number = number.replace('91CM-','').replace('91MS-','')
|
|
||||||
url = host + str(search(number))
|
|
||||||
htmlcode = get_html(url)
|
|
||||||
except:
|
|
||||||
# print(number)
|
|
||||||
pass
|
|
||||||
html = etree.fromstring(htmlcode, etree.HTMLParser())
|
|
||||||
dic = {
|
|
||||||
# 标题
|
|
||||||
'title': getTitle(html),
|
|
||||||
# 制作商
|
|
||||||
'studio': getStudio(html),
|
|
||||||
# 年份
|
|
||||||
'year': getYear(html),
|
|
||||||
# 简介
|
|
||||||
'outline': getOutline(html),
|
|
||||||
#
|
|
||||||
'runtime': getRuntime(html),
|
|
||||||
# 导演
|
|
||||||
'director': getDirector(html),
|
|
||||||
# 演员
|
|
||||||
'actor': getActor(html),
|
|
||||||
# 发售日
|
|
||||||
'release': getRelease(html),
|
|
||||||
# 番号
|
|
||||||
'number': getNum(html),
|
|
||||||
# 封面链接
|
|
||||||
'cover': getCover(htmlcode),
|
|
||||||
# 剧照获取
|
|
||||||
'extrafanart': getExtrafanart(html),
|
|
||||||
'imagecut': 1,
|
|
||||||
#
|
|
||||||
'tag': getTag(html),
|
|
||||||
#
|
|
||||||
'label': getSerise(html),
|
|
||||||
# 作者图片
|
|
||||||
'website': url,
|
|
||||||
'source': 'mv91.py',
|
|
||||||
# 使用
|
|
||||||
'series': getSerise(html)
|
|
||||||
}
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
except Exception as e:
|
|
||||||
if config.getInstance().debug():
|
|
||||||
print(e)
|
|
||||||
data = {
|
|
||||||
"title": "",
|
|
||||||
}
|
|
||||||
js = json.dumps(
|
|
||||||
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
|
|
||||||
)
|
|
||||||
return js
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
print(main('91CM-121'))
|
|
||||||
print(main('91CM-122'))
|
|
||||||
print(main('91CM-143'))
|
|
||||||
print(main('91MS-006'))
|
|
||||||
@@ -1,220 +0,0 @@
|
|||||||
import sys
|
|
||||||
sys.path.append('../')
|
|
||||||
from ADC_function import *
|
|
||||||
from WebCrawler.storyline import getStoryline
|
|
||||||
|
|
||||||
def getTitle(html):
|
|
||||||
result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def getActor(browser):
|
|
||||||
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
|
|
||||||
t = []
|
|
||||||
for i in htmla:
|
|
||||||
t.append(i.text.strip())
|
|
||||||
return t
|
|
||||||
|
|
||||||
|
|
||||||
def getActorPhoto(browser):
|
|
||||||
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
|
|
||||||
t = {i.text.strip(): i['href'] for i in htmla}
|
|
||||||
o = {}
|
|
||||||
for k, v in t.items():
|
|
||||||
r = browser.open_relative(v)
|
|
||||||
if not r.ok:
|
|
||||||
continue
|
|
||||||
pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
|
|
||||||
if 'noimage.gif' in pic['src']:
|
|
||||||
continue
|
|
||||||
o[k] = urljoin(browser.url, pic['src'])
|
|
||||||
return o
|
|
||||||
|
|
||||||
|
|
||||||
def getStudio(html):
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
|
|
||||||
except:
|
|
||||||
result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
|
|
||||||
return result.strip('+').replace("', '", '').replace('"', '')
|
|
||||||
|
|
||||||
|
|
||||||
def getRuntime(html):
|
|
||||||
try:
|
|
||||||
x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
|
|
||||||
return x
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getLabel(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getNum(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath('//*[@id="hinban"]/text()')[0]
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getYear(getRelease):
|
|
||||||
try:
|
|
||||||
result = str(re.search(r'\d{4}', getRelease).group())
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return getRelease
|
|
||||||
|
|
||||||
|
|
||||||
def getRelease(html):
|
|
||||||
try:
|
|
||||||
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
try:
|
|
||||||
return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getTag(html):
|
|
||||||
result = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
|
|
||||||
total = []
|
|
||||||
for i in result:
|
|
||||||
total.append(i.replace("\n","").replace("\t",""))
|
|
||||||
return total
|
|
||||||
|
|
||||||
|
|
||||||
def getCover_small(html, index=0):
|
|
||||||
# same issue mentioned below:
# javdb sometimes returns multiple results,
# so DO NOT just take the first one — use the entry at the requested index
|
|
||||||
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
|
|
||||||
if not 'https' in result:
|
|
||||||
result = 'https:' + result
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def getCover(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
|
|
||||||
return 'https:' + result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getDirector(html):
|
|
||||||
try:
|
|
||||||
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def getOutline(html, number, title):
|
|
||||||
storyline_site = config.getInstance().storyline_site().split(',')
|
|
||||||
a = set(storyline_site) & {'airav', 'avno1'}  # only the sites that provide Chinese synopsis text
|
|
||||||
if len(a):
|
|
||||||
site = [n for n in storyline_site if n in a]
|
|
||||||
g = getStoryline(number, title, site, 无码=False)
|
|
||||||
if len(g):
|
|
||||||
return g
|
|
||||||
try:
|
|
||||||
x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
|
|
||||||
return x.replace(getNum(html), '')
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getSeries(html):
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
|
|
||||||
return result
|
|
||||||
except:
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getExtrafanart(htmlcode): # 获取剧照
|
|
||||||
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
|
|
||||||
html = html_pather.search(htmlcode)
|
|
||||||
if html:
|
|
||||||
html = html.group()
|
|
||||||
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
|
|
||||||
extrafanart_imgs = extrafanart_pather.findall(html)
|
|
||||||
if extrafanart_imgs:
|
|
||||||
s = []
|
|
||||||
for urli in extrafanart_imgs:
|
|
||||||
urli = 'https:' + urli.replace('/scene/small', '')
|
|
||||||
s.append(urli)
|
|
||||||
return s
|
|
||||||
return ''
|
|
||||||
|
|
||||||
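# Submit the site-wide search form from a randomly chosen entry page and follow
# the first avod/detail link; raises ValueError when nothing is found.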
def open_by_browser(number):
|
|
||||||
xcity_number = number.replace('-','')
|
|
||||||
query_result, browser = get_html_by_form(
|
|
||||||
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
|
|
||||||
fields = {'q' : xcity_number.lower()},
|
|
||||||
return_type = 'browser')
|
|
||||||
if not query_result or not query_result.ok:
|
|
||||||
raise ValueError("xcity.py: page not found")
|
|
||||||
result = browser.follow_link(browser.links('avod\/detail')[0])
|
|
||||||
if not result.ok:
|
|
||||||
raise ValueError("xcity.py: detail page not found")
|
|
||||||
return str(browser.page), browser
|
|
||||||
|
|
||||||
def main(number):
|
|
||||||
try:
|
|
||||||
detail_page, browser = open_by_browser(number)
|
|
||||||
url = browser.url
|
|
||||||
lx = etree.fromstring(detail_page, etree.HTMLParser())
|
|
||||||
newnum = getNum(lx).upper()
|
|
||||||
number_up = number.upper()
|
|
||||||
if newnum != number_up:
|
|
||||||
if newnum == number.replace('-','').upper():
|
|
||||||
newnum = number_up
|
|
||||||
else:
|
|
||||||
raise ValueError("xcity.py: number not found")
|
|
||||||
title = getTitle(lx)
|
|
||||||
dic = {
|
|
||||||
'actor': getActor(browser),
|
|
||||||
'title': title,
|
|
||||||
'studio': getStudio(lx),
|
|
||||||
'outline': getOutline(lx, number, title),
|
|
||||||
'runtime': getRuntime(lx),
|
|
||||||
'director': getDirector(lx),
|
|
||||||
'release': getRelease(lx),
|
|
||||||
'number': newnum,
|
|
||||||
'cover': getCover(lx),
|
|
||||||
'cover_small': '',
|
|
||||||
'extrafanart': getExtrafanart(detail_page),
|
|
||||||
'imagecut': 1,
|
|
||||||
'tag': getTag(lx),
|
|
||||||
'label': getLabel(lx),
|
|
||||||
'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
|
|
||||||
'website': url,
|
|
||||||
'source': 'xcity.py',
|
|
||||||
'series': getSeries(lx),
|
|
||||||
}
|
|
||||||
if config.getInstance().download_actor_photo_for_kodi():
|
|
||||||
dic['actor_photo'] = getActorPhoto(browser)
|
|
||||||
except Exception as e:
|
|
||||||
if config.getInstance().debug():
|
|
||||||
print(e)
|
|
||||||
dic = {"title": ""}
|
|
||||||
|
|
||||||
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
|
|
||||||
return js
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
config.getInstance().set_override("storyline:switch=0")
|
|
||||||
config.getInstance().set_override("actor_photo:download_for_kodi=1")
|
|
||||||
config.getInstance().set_override("debug_mode:switch=1")
|
|
||||||
print(main('RCTD-288'))
|
|
||||||
print(main('VNDS-2624'))
|
|
||||||
print(main('ABP-345'))
|
|
||||||
core.py
@@ -14,7 +14,8 @@ from datetime import datetime
 from lxml import etree

 from ADC_function import *
-from WebCrawler import get_data_from_json
+# from WebCrawler import get_data_from_json
+from scraper import get_data_from_json
 from number_parser import is_uncensored
 from ImageProcessing import cutImage

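Note: the import swap above is the only change core.py needs, since get_data_from_json keeps its (file_number, oCC) signature. A minimal caller sketch, assuming an OpenCC-style converter object oCC; the wrapper function name below is illustrative, not part of the repository:

from scraper import get_data_from_json

def fetch_metadata(file_number, oCC):
    # Delegates to the new scraper module; returns a dict on success,
    # None when no source produced usable metadata.
    json_data = get_data_from_json(file_number, oCC)
    if not json_data:
        print(f'[-]No metadata found for {file_number}')
    return json_data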
@@ -1,45 +1,11 @@
 import json
-import re
-from multiprocessing.pool import ThreadPool
+import secrets

-import ADC_function
 import config
-from ADC_function import translate
 from lxml import etree
 from pathlib import Path

-# =========website========
-from . import airav
-from . import avsox
-from . import fanza
-from . import fc2
-from . import jav321
-from . import javbus
-from . import javdb
-from . import mgstage
-from . import xcity
-# from . import javlib
-from . import dlsite
-from . import carib
-from . import fc2club
-from . import mv91
-from . import madou
-from . import gcolle
-from . import getchu
-
-
-def get_data_state(data: dict) -> bool: # 元数据获取失败检测
-    if "title" not in data or "number" not in data:
-        return False
-
-    if data["title"] is None or data["title"] == "" or data["title"] == "null":
-        return False
-
-    if data["number"] is None or data["number"] == "" or data["number"] == "null":
-        return False
-
-    return True
-
-
+from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
+from scrapinglib.api import search
 def get_data_from_json(file_number, oCC):
     """
@@ -49,116 +15,45 @@ def get_data_from_json(file_number, oCC):
     actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
     info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))

-    func_mapping = {
-        "airav": airav.main,
-        "avsox": avsox.main,
-        "fc2": fc2.main,
-        "fanza": fanza.main,
-        "javdb": javdb.main,
-        "javbus": javbus.main,
-        "mgstage": mgstage.main,
-        "jav321": jav321.main,
-        "xcity": xcity.main,
-        # "javlib": javlib.main,
-        "dlsite": dlsite.main,
-        "carib": carib.main,
-        "fc2club": fc2club.main,
-        "mv91": mv91.main,
-        "madou": madou.main,
-        "gcolle": gcolle.main,
-        "getchu": getchu.main,
-    }
-
 conf = config.getInstance()
 # default fetch order list, from the beginning to the end
-    sources = conf.sources().split(',')
-    def insert(sources,source):
-        if source in sources:
-            sources.insert(0, sources.pop(sources.index(source)))
-        return sources
+    sources = conf.sources()

-    if len(sources) <= len(func_mapping):
-        # if the input file name matches certain rules,
-        # move some web service to the beginning of the list
-        lo_file_number = file_number.lower()
-        if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
-                ):
-            sources = insert(sources,"carib")
-        elif "item" in file_number or "GETCHU" in file_number.upper():
-            sources = insert(sources,"getchu")
-        elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
-            sources = insert(sources, "getchu")
-            sources = insert(sources, "dlsite")
-        elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
-            if "avsox" in sources:
-                sources = insert(sources,"avsox")
-        elif "mgstage" in sources and \
-                (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
-            sources = insert(sources,"mgstage")
-        elif "fc2" in lo_file_number:
-            if "fc2" in sources:
-                sources = insert(sources,"fc2")
-        elif "gcolle" in sources and (re.search("\d{6}", file_number)):
-            sources = insert(sources,"gcolle")
-        elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
-            if "xcity" in sources:
-                sources = insert(sources,"xcity")
-            if "madou" in sources:
-                sources = insert(sources,"madou")
-        elif "madou" in sources and (
-                re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
-        ):
-            sources = insert(sources,"madou")
-
-    # check sources in func_mapping
-    todel = []
-    for s in sources:
-        if not s in func_mapping:
-            print('[!] Source Not Exist : ' + s)
-            todel.append(s)
-    for d in todel:
-        print('[!] Remove Source : ' + s)
-        sources.remove(d)
-
-    json_data = {}
-
-    if conf.multi_threading():
-        pool = ThreadPool(processes=len(conf.sources().split(',')))
-
-        # Set the priority of multi-thread crawling and join the multi-thread queue
-        for source in sources:
-            pool.apply_async(func_mapping[source], (file_number,))
-
-        # Get multi-threaded crawling response
-        for source in sources:
-            if conf.debug() == True:
-                print('[+]select', source)
-            try:
-                json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
-            except:
-                json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
-            # if any service return a valid return, break
-            if get_data_state(json_data):
-                print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
+    # TODO 准备参数
+    # - 清理 ADC_function, webcrawler
+    proxies = None
+    configProxy = conf.proxy()
+    if configProxy.enable:
+        proxies = configProxy.proxies()
+
+    javdb_sites = conf.javdb_sites().split(',')
+    for i in javdb_sites:
+        javdb_sites[javdb_sites.index(i)] = "javdb" + i
+    javdb_sites.append("javdb")
+    # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天
+    has_json = False
+    for cj in javdb_sites:
+        javdb_site = cj
+        cookie_json = javdb_site + '.json'
+        cookies_dict, cookies_filepath = load_cookies(cookie_json)
+        if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
+            cdays = file_modification_days(cookies_filepath)
+            if cdays < 7:
+                javdb_cookies = cookies_dict
+                has_json = True
                 break
-        pool.close()
-        pool.terminate()
-    else:
-        for source in sources:
-            try:
-                if conf.debug() == True:
-                    print('[+]select', source)
-                try:
-                    json_data = json.loads(func_mapping[source](file_number))
-                except:
-                    json_data = func_mapping[source](file_number)
-                # if any service return a valid return, break
-                if get_data_state(json_data):
-                    print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
-                    break
-            except:
-                continue
+            elif cdays != 9999:
+                print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
+    if not has_json:
+        javdb_site = secrets.choice(javdb_sites)
+        javdb_cookies = None

+    cacert =None
+    if conf.cacert_file():
+        cacert = conf.cacert_file()
+    json_data = search(file_number, sources, proxies=proxies, verify=cacert,
+                       dbsite=javdb_site, dbcookies=javdb_cookies,
+                       morestoryline=conf.is_storyline())
     # Return if data not found in all sources
     if not json_data:
         print('[-]Movie Number not found!')
@@ -316,26 +211,26 @@ def get_data_from_json(file_number, oCC):
             try:
                 if ccm == 1:
                     json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
                 elif ccm == 2:
                     json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
                 elif ccm == 3:
                     json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
             except:
                 json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
         else:
             try:
                 if ccm == 1:
                     json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
                 elif ccm == 2:
                     json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
                 elif ccm == 3:
                     json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
-                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
+                    json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
             except IndexError:
                 json_data[cc] = oCC.convert(json_data[cc])
             except:
scrapinglib/__init__.py (new file)
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+from .api import search
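Because the package re-exports search, callers only need the package-level import. A usage sketch (the number and source list are examples only; results depend on network access):

from scrapinglib import search

# Query two adult sources in order; search returns a dict on success, None otherwise.
data = search('ABP-345', sources='avsox,javbus')
if data:
    print(data.get('number'), data.get('title'))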
scrapinglib/airav.py (new file)
@@ -0,0 +1,114 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from .parser import Parser
|
||||||
|
from .javbus import Javbus
|
||||||
|
|
||||||
|
class Airav(Parser):
|
||||||
|
source = 'airav'
|
||||||
|
|
||||||
|
expr_title = '/html/head/title/text()'
|
||||||
|
expr_number = '/html/head/title/text()'
|
||||||
|
expr_studio = '//a[contains(@href,"?video_factory=")]/text()'
|
||||||
|
expr_release = '//li[contains(text(),"發片日期")]/text()'
|
||||||
|
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
|
||||||
|
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
|
||||||
|
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
|
||||||
|
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
|
||||||
|
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number
|
||||||
|
self.detailurl = 'https://cn.airav.wiki/video/' + number
|
||||||
|
engine = Javbus()
|
||||||
|
javbusinfo = engine.scrape(number, self)
|
||||||
|
if javbusinfo == 404:
|
||||||
|
self.javbus = {"title": ""}
|
||||||
|
else:
|
||||||
|
self.javbus = json.loads(javbusinfo)
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
# return super().getNum(htmltree)
|
||||||
|
result = self.javbus.get('number')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
number = super().getNum(htmltree)
|
||||||
|
result = str(re.findall('^\[(.*?)]', number)[0])
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
title = super().getTitle(htmltree)
|
||||||
|
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
result = self.javbus.get('studio')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
return super().getStudio(htmltree)
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
result = self.javbus.get('release')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
try:
|
||||||
|
return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getYear(self, htmltree):
|
||||||
|
result = self.javbus.get('year')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
release = self.getRelease(htmltree)
|
||||||
|
return str(re.findall('\d{4}', release)).strip(" ['']")
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
result = self.javbus.get('runtime')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getDirector(self, htmltree):
|
||||||
|
result = self.javbus.get('director')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
b=[]
|
||||||
|
a = super().getActors(htmltree)
|
||||||
|
for v in a:
|
||||||
|
v = v.strip()
|
||||||
|
if len(v):
|
||||||
|
b.append(v)
|
||||||
|
if len(b):
|
||||||
|
return b
|
||||||
|
result = self.javbus.get('actor')
|
||||||
|
if isinstance(result, list) and len(result):
|
||||||
|
return result
|
||||||
|
return []
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
result = self.javbus.get('cover')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
return super().getCover(htmltree)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getSeries(self, htmltree):
|
||||||
|
result = self.javbus.get('series')
|
||||||
|
if isinstance(result, str) and len(result):
|
||||||
|
return result
|
||||||
|
return ''
|
||||||
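The Airav parser above consistently prefers the Javbus record and only falls back to parsing the airav page when a field is missing. Reduced to a standalone sketch (the helper names below are illustrative, not part of the module):

def prefer_javbus(javbus_json: dict, key: str, fallback):
    # Use the Javbus value when it is a non-empty string, otherwise run the
    # page-level fallback parser.
    value = javbus_json.get(key)
    if isinstance(value, str) and len(value):
        return value
    return fallback()

# e.g. studio = prefer_javbus(javbus_json, 'studio', lambda: parse_studio(htmltree))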
scrapinglib/api.py (new file)
@@ -0,0 +1,225 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
|
from .airav import Airav
|
||||||
|
from .carib import Carib
|
||||||
|
from .dlsite import Dlsite
|
||||||
|
from .fanza import Fanza
|
||||||
|
from .gcolle import Gcolle
|
||||||
|
from .getchu import Getchu
|
||||||
|
from .jav321 import Jav321
|
||||||
|
from .javdb import Javdb
|
||||||
|
from .mv91 import Mv91
|
||||||
|
from .fc2 import Fc2
|
||||||
|
from .madou import Madou
|
||||||
|
from .mgstage import Mgstage
|
||||||
|
from .javbus import Javbus
|
||||||
|
from .xcity import Xcity
|
||||||
|
from .avsox import Avsox
|
||||||
|
|
||||||
|
from .tmdb import Tmdb
|
||||||
|
|
||||||
|
|
||||||
|
def search(number, sources: str=None, proxies=None, verify=None, type='adult',
|
||||||
|
dbcookies=None, dbsite=None, morestoryline=False):
|
||||||
|
""" 根据``番号/电影``名搜索信息
|
||||||
|
|
||||||
|
:param number: number/name depends on type
|
||||||
|
:param sources: sources string with `,` like ``avsox,javbus``
|
||||||
|
:param type: ``adult``, ``general``
|
||||||
|
"""
|
||||||
|
sc = Scraping()
|
||||||
|
return sc.search(number, sources, proxies=proxies, verify=verify, type=type,
|
||||||
|
dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline)
|
||||||
|
|
||||||
|
class Scraping():
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
|
||||||
|
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
|
||||||
|
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
|
||||||
|
'gcolle', 'javdb', 'getchu']
|
||||||
|
adult_func_mapping = {
|
||||||
|
'avsox': Avsox().scrape,
|
||||||
|
'javbus': Javbus().scrape,
|
||||||
|
'xcity': Xcity().scrape,
|
||||||
|
'mgstage': Mgstage().scrape,
|
||||||
|
'madou': Madou().scrape,
|
||||||
|
'fc2': Fc2().scrape,
|
||||||
|
'dlsite': Dlsite().scrape,
|
||||||
|
'jav321': Jav321().scrape,
|
||||||
|
'fanza': Fanza().scrape,
|
||||||
|
'airav': Airav().scrape,
|
||||||
|
'carib': Carib().scrape,
|
||||||
|
'mv91': Mv91().scrape,
|
||||||
|
'gcolle': Gcolle().scrape,
|
||||||
|
'javdb': Javdb().scrape,
|
||||||
|
'getchu': Getchu().scrape,
|
||||||
|
}
|
||||||
|
|
||||||
|
general_full_sources = ['tmdb']
|
||||||
|
general_func_mapping = {
|
||||||
|
'tmdb': Tmdb().scrape,
|
||||||
|
}
|
||||||
|
|
||||||
|
proxies = None
|
||||||
|
verify = None
|
||||||
|
|
||||||
|
dbcookies = None
|
||||||
|
dbsite = None
|
||||||
|
# 使用storyline方法进一步获取故事情节
|
||||||
|
morestoryline = False
|
||||||
|
|
||||||
|
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
|
||||||
|
dbcookies=None, dbsite=None, morestoryline=False):
|
||||||
|
self.proxies = proxies
|
||||||
|
self.verify = verify
|
||||||
|
self.dbcookies = dbcookies
|
||||||
|
self.dbsite = dbsite
|
||||||
|
self.morestoryline = morestoryline
|
||||||
|
if type == 'adult':
|
||||||
|
return self.searchAdult(number, sources)
|
||||||
|
else:
|
||||||
|
return self.searchGeneral(number, sources)
|
||||||
|
|
||||||
|
def searchGeneral(self, name, sources):
|
||||||
|
""" 查询电影电视剧
|
||||||
|
imdb,tmdb
|
||||||
|
"""
|
||||||
|
sources = self.checkGeneralSources(sources, name)
|
||||||
|
json_data = {}
|
||||||
|
for source in sources:
|
||||||
|
try:
|
||||||
|
print('[+]select', source)
|
||||||
|
try:
|
||||||
|
data = self.general_func_mapping[source](name, self)
|
||||||
|
if data == 404:
|
||||||
|
continue
|
||||||
|
json_data = json.loads(data)
|
||||||
|
except Exception as e:
|
||||||
|
print('[!] 出错啦')
|
||||||
|
print(e)
|
||||||
|
# if any service return a valid return, break
|
||||||
|
if self.get_data_state(json_data):
|
||||||
|
print(f"[+]Find movie [{name}] metadata on website '{source}'")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Return if data not found in all sources
|
||||||
|
if not json_data:
|
||||||
|
print(f'[-]Movie Number [{name}] not found!')
|
||||||
|
return None
|
||||||
|
|
||||||
|
return json_data
|
||||||
|
|
||||||
|
def searchAdult(self, number, sources):
|
||||||
|
sources = self.checkAdultSources(sources, number)
|
||||||
|
json_data = {}
|
||||||
|
for source in sources:
|
||||||
|
try:
|
||||||
|
print('[+]select', source)
|
||||||
|
try:
|
||||||
|
data = self.adult_func_mapping[source](number, self)
|
||||||
|
if data == 404:
|
||||||
|
continue
|
||||||
|
json_data = json.loads(data)
|
||||||
|
except Exception as e:
|
||||||
|
print('[!] 出错啦')
|
||||||
|
print(e)
|
||||||
|
# json_data = self.func_mapping[source](number, self)
|
||||||
|
# if any service return a valid return, break
|
||||||
|
if self.get_data_state(json_data):
|
||||||
|
print(f"[+]Find movie [{number}] metadata on website '{source}'")
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Return if data not found in all sources
|
||||||
|
if not json_data:
|
||||||
|
print(f'[-]Movie Number [{number}] not found!')
|
||||||
|
return None
|
||||||
|
|
||||||
|
return json_data
|
||||||
|
|
||||||
|
def checkGeneralSources(self, c_sources, name):
|
||||||
|
if not c_sources:
|
||||||
|
sources = self.general_full_sources
|
||||||
|
else:
|
||||||
|
sources = c_sources.split(',')
|
||||||
|
|
||||||
|
# check sources in func_mapping
|
||||||
|
todel = []
|
||||||
|
for s in sources:
|
||||||
|
if not s in self.general_func_mapping:
|
||||||
|
print('[!] Source Not Exist : ' + s)
|
||||||
|
todel.append(s)
|
||||||
|
for d in todel:
|
||||||
|
print('[!] Remove Source : ' + s)
|
||||||
|
sources.remove(d)
|
||||||
|
return sources
|
||||||
|
|
||||||
|
def checkAdultSources(self, c_sources, file_number):
|
||||||
|
if not c_sources:
|
||||||
|
sources = self.adult_full_sources
|
||||||
|
else:
|
||||||
|
sources = c_sources.split(',')
|
||||||
|
def insert(sources,source):
|
||||||
|
if source in sources:
|
||||||
|
sources.insert(0, sources.pop(sources.index(source)))
|
||||||
|
return sources
|
||||||
|
|
||||||
|
if len(sources) <= len(self.adult_func_mapping):
|
||||||
|
# if the input file name matches certain rules,
|
||||||
|
# move some web service to the beginning of the list
|
||||||
|
lo_file_number = file_number.lower()
|
||||||
|
if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
|
||||||
|
):
|
||||||
|
sources = insert(sources,"carib")
|
||||||
|
elif "item" in file_number or "GETCHU" in file_number.upper():
|
||||||
|
sources = insert(sources,"getchu")
|
||||||
|
elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
|
||||||
|
sources = insert(sources, "getchu")
|
||||||
|
sources = insert(sources, "dlsite")
|
||||||
|
elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
|
||||||
|
if "avsox" in sources:
|
||||||
|
sources = insert(sources,"avsox")
|
||||||
|
elif "mgstage" in sources and \
|
||||||
|
(re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
|
||||||
|
sources = insert(sources,"mgstage")
|
||||||
|
elif "fc2" in lo_file_number:
|
||||||
|
if "fc2" in sources:
|
||||||
|
sources = insert(sources,"fc2")
|
||||||
|
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
|
||||||
|
sources = insert(sources,"gcolle")
|
||||||
|
elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
|
||||||
|
if "xcity" in sources:
|
||||||
|
sources = insert(sources,"xcity")
|
||||||
|
if "madou" in sources:
|
||||||
|
sources = insert(sources,"madou")
|
||||||
|
elif "madou" in sources and (
|
||||||
|
re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
|
||||||
|
):
|
||||||
|
sources = insert(sources,"madou")
|
||||||
|
|
||||||
|
# check sources in func_mapping
|
||||||
|
todel = []
|
||||||
|
for s in sources:
|
||||||
|
if not s in self.adult_func_mapping:
|
||||||
|
print('[!] Source Not Exist : ' + s)
|
||||||
|
todel.append(s)
|
||||||
|
for d in todel:
|
||||||
|
print('[!] Remove Source : ' + s)
|
||||||
|
sources.remove(d)
|
||||||
|
return sources
|
||||||
|
|
||||||
|
def get_data_state(self, data: dict) -> bool: # 元数据获取失败检测
|
||||||
|
if "title" not in data or "number" not in data:
|
||||||
|
return False
|
||||||
|
if data["title"] is None or data["title"] == "" or data["title"] == "null":
|
||||||
|
return False
|
||||||
|
if data["number"] is None or data["number"] == "" or data["number"] == "null":
|
||||||
|
return False
|
||||||
|
return True
|
||||||
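checkAdultSources is what turns the number format into a scraping priority: FC2-style, Caribbean-style or all-digit numbers pull the matching scraper to the front of the default list. A small sketch with a Caribbean-style number, assuming the default source list:

from scrapinglib.api import Scraping

sc = Scraping()
# A number shaped like 123456-789 moves 'carib' to the head of the source list,
# so it is scraped first.
ordered = sc.checkAdultSources(None, '123456-789')
print(ordered[0])  # 'carib'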
scrapinglib/avsox.py (new file)
@@ -0,0 +1,80 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Avsox(Parser):
|
||||||
|
|
||||||
|
source = 'avsox'
|
||||||
|
imagecut = 3
|
||||||
|
|
||||||
|
expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
|
||||||
|
expr_actor = '//a[@class="avatar-box"]'
|
||||||
|
expr_actorphoto = '//a[@class="avatar-box"]'
|
||||||
|
expr_title = '/html/body/div[2]/h3/text()'
|
||||||
|
expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()'
|
||||||
|
expr_release = '//span[contains(text(),"发行时间:")]/../text()'
|
||||||
|
expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src'
|
||||||
|
expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src'
|
||||||
|
expr_tags = '/html/head/meta[@name="keywords"]/@content'
|
||||||
|
expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
|
||||||
|
expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
|
||||||
|
site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
|
||||||
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
|
||||||
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
|
if result1 == '' or result1 == 'null' or result1 == 'None':
|
||||||
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
|
||||||
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
|
if result1 == '' or result1 == 'null' or result1 == 'None':
|
||||||
|
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
|
||||||
|
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
|
||||||
|
return "https:" + result1
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
new_number = self.getTreeElement(htmltree, self.expr_number)
|
||||||
|
if new_number.upper() != self.number.upper():
|
||||||
|
raise ValueError('number not found in ' + self.source)
|
||||||
|
self.number = new_number
|
||||||
|
return new_number
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
return super().getTitle(htmltree).replace('/', '').strip(self.number)
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
return super().getStudio(htmltree).replace("', '", ' ')
|
||||||
|
|
||||||
|
def getSmallCover(self, htmltree):
|
||||||
|
""" 使用搜索页面的预览小图
|
||||||
|
"""
|
||||||
|
return self.getTreeElement(self.searchtree, self.expr_smallcover)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
tags = super().getTags(htmltree).split(',')
|
||||||
|
return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
if self.morestoryline:
|
||||||
|
from .storyline import getStoryline
|
||||||
|
return getStoryline(self.number)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
a = super().getActors(htmltree)
|
||||||
|
d = []
|
||||||
|
for i in a:
|
||||||
|
d.append(i.find('span').text)
|
||||||
|
return d
|
||||||
|
|
||||||
|
def getActorPhoto(self, htmltree):
|
||||||
|
a = super().getActorPhoto(htmltree)
|
||||||
|
d = {}
|
||||||
|
for i in a:
|
||||||
|
l = i.find('.//img').attrib['src']
|
||||||
|
t = i.find('span').text
|
||||||
|
p2 = {t: l}
|
||||||
|
d.update(p2)
|
||||||
|
return d
|
||||||
scrapinglib/carib.py (new file)
@@ -0,0 +1,99 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from lxml import html
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Carib(Parser):
|
||||||
|
source = 'carib'
|
||||||
|
uncensored = True
|
||||||
|
|
||||||
|
expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
|
||||||
|
expr_release = "//li[2]/span[@class='spec-content']/text()"
|
||||||
|
expr_runtime = "//span[@class='spec-content']/span[@itemprop='duration']/text()"
|
||||||
|
expr_actor = "//span[@class='spec-content']/a[@itemprop='actor']/span/text()"
|
||||||
|
expr_tags = "//span[@class='spec-content']/a[@itemprop='genre']/text()"
|
||||||
|
expr_extrafanart = "//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href"
|
||||||
|
expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
|
||||||
|
expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
|
||||||
|
expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number
|
||||||
|
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
|
||||||
|
htmlcode = self.getHtml(self.detailurl)
|
||||||
|
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
|
||||||
|
return 404
|
||||||
|
htmltree = html.fromstring(htmlcode)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
return '加勒比'
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
r = []
|
||||||
|
actors = super().getActors(htmltree)
|
||||||
|
for act in actors:
|
||||||
|
if str(act) != '他':
|
||||||
|
r.append(act)
|
||||||
|
return r
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return self.number
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
r = []
|
||||||
|
genres = self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
|
for g in genres:
|
||||||
|
jpg = str(g)
|
||||||
|
if '/member/' in jpg:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
r.append('https://www.caribbeancom.com' + jpg)
|
||||||
|
return r
|
||||||
|
|
||||||
|
def getActorPhoto(self, htmltree):
|
||||||
|
# return super().getActorPhoto(htmltree)
|
||||||
|
htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
|
||||||
|
names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
|
||||||
|
t = {}
|
||||||
|
for name, a in zip(names, htmla):
|
||||||
|
if name.strip() == '他':
|
||||||
|
continue
|
||||||
|
p = {name.strip(): a.attrib['href']}
|
||||||
|
t.update(p)
|
||||||
|
o = {}
|
||||||
|
for k, v in t.items():
|
||||||
|
if '/search_act/' not in v:
|
||||||
|
continue
|
||||||
|
r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object')
|
||||||
|
if not r.ok:
|
||||||
|
continue
|
||||||
|
html = r.text
|
||||||
|
pos = html.find('.full-bg')
|
||||||
|
if pos<0:
|
||||||
|
continue
|
||||||
|
css = html[pos:pos+100]
|
||||||
|
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
|
||||||
|
if not cssBGjpgs or not len(cssBGjpgs[0]):
|
||||||
|
continue
|
||||||
|
p = {k: urljoin(r.url, cssBGjpgs[0])}
|
||||||
|
o.update(p)
|
||||||
|
return o
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
from .storyline import getStoryline
|
||||||
|
result = getStoryline(self.number, uncensored=self.uncensored)
|
||||||
|
if len(result):
|
||||||
|
return result
|
||||||
|
return super().getOutline(htmltree)
|
||||||
|
|
||||||
scrapinglib/dlsite.py (new file)
@@ -0,0 +1,97 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Dlsite(Parser):
|
||||||
|
source = 'dlsite'
|
||||||
|
imagecut = 4
|
||||||
|
allow_number_change = True
|
||||||
|
|
||||||
|
expr_title = '/html/head/title/text()'
|
||||||
|
expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
|
||||||
|
expr_studio = '//th[contains(text(),"商标名")]/../td/span[1]/a/text()'
|
||||||
|
expr_studio2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
|
||||||
|
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
|
||||||
|
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
|
||||||
|
expr_outline = '//*[@class="work_parts_area"]/p/text()'
|
||||||
|
expr_series = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
|
||||||
|
expr_series2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
|
||||||
|
expr_director = '//th[contains(text(),"剧情")]/../td/a/text()'
|
||||||
|
expr_release = '//th[contains(text(),"贩卖日")]/../td/a/text()'
|
||||||
|
expr_cover = '//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset'
|
||||||
|
expr_tags = '//th[contains(text(),"分类")]/../td/div/a/text()'
|
||||||
|
expr_label = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
|
||||||
|
expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
|
||||||
|
expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.cookies = {'locale': 'zh-cn'}
|
||||||
|
if "RJ" in number or "VJ" in number:
|
||||||
|
self.number = number.upper()
|
||||||
|
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
else:
|
||||||
|
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
|
if len(search_result) == 0:
|
||||||
|
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
|
||||||
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
|
if len(search_result) == 0:
|
||||||
|
if "~" in number:
|
||||||
|
number = number.replace("~","〜")
|
||||||
|
elif "〜" in number:
|
||||||
|
number = number.replace("〜","~")
|
||||||
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
|
if len(search_result) == 0:
|
||||||
|
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
|
||||||
|
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
|
||||||
|
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
|
||||||
|
self.detailurl = search_result[0]
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
|
||||||
|
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return self.number
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
result = super().getTitle(htmltree)
|
||||||
|
result = result[:result.rfind(' | DLsite')]
|
||||||
|
result = result[:result.rfind(' [')]
|
||||||
|
if 'OFF】' in result:
|
||||||
|
result = result[result.find('】')+1:]
|
||||||
|
result = result.replace('【HD版】', '')
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
total = []
|
||||||
|
result = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
|
for i in result:
|
||||||
|
total.append(i.strip('\r\n'))
|
||||||
|
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return super().getRelease(htmltree).replace('年','-').replace('月','-').replace('日','')
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
try:
|
||||||
|
result = []
|
||||||
|
for i in self.getTreeAll(self.expr_extrafanart):
|
||||||
|
result.append("https:" + i)
|
||||||
|
except:
|
||||||
|
result = ''
|
||||||
|
return result
|
||||||
scrapinglib/fanza.py (new file)
@@ -0,0 +1,130 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Fanza(Parser):
|
||||||
|
source = 'fanza'
|
||||||
|
|
||||||
|
expr_title = '//*[starts-with(@id, "title")]/text()'
|
||||||
|
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
|
||||||
|
expr_cover = '//head/meta[@property="og:image"]'
|
||||||
|
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
|
||||||
|
expr_outline = "//div[@class='mg-b20 lh4']/text()"
|
||||||
|
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
|
||||||
|
expr_outline_og = '//head/meta[@property="og:description"]'
|
||||||
|
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number
|
||||||
|
# fanza allow letter + number + underscore, normalize the input here
|
||||||
|
# @note: I only find the usage of underscore as h_test123456789
|
||||||
|
fanza_search_number = number
|
||||||
|
# AV_Data_Capture.py.getNumber() over format the input, restore the h_ prefix
|
||||||
|
if fanza_search_number.startswith("h-"):
|
||||||
|
fanza_search_number = fanza_search_number.replace("h-", "h_")
|
||||||
|
|
||||||
|
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
|
||||||
|
|
||||||
|
fanza_urls = [
|
||||||
|
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
|
||||||
|
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
|
||||||
|
]
|
||||||
|
|
||||||
|
for url in fanza_urls:
|
||||||
|
self.detailurl = url + fanza_search_number
|
||||||
|
url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
|
||||||
|
self.htmlcode = self.getHtml(url)
|
||||||
|
if self.htmlcode != 404:
|
||||||
|
self.htmltree = etree.HTML(self.htmlcode)
|
||||||
|
break
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
result = self.dictformat(self.htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
# for some old page, the input number does not match the page
|
||||||
|
# for example, the url will be cid=test012
|
||||||
|
# but the hinban on the page is test00012
|
||||||
|
# so get the hinban first, and then pass it to following functions
|
||||||
|
self.fanza_hinban = self.getFanzaString('品番:')
|
||||||
|
self.number = self.fanza_hinban
|
||||||
|
number_lo = self.number.lower()
|
||||||
|
if (re.sub('-|_', '', number_lo) == self.fanza_hinban or
|
||||||
|
number_lo.replace('-', '00') == self.fanza_hinban or
|
||||||
|
number_lo.replace('-', '') + 'so' == self.fanza_hinban
|
||||||
|
):
|
||||||
|
self.number = self.number
|
||||||
|
return self.number
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
return self.getFanzaString('メーカー')
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
try:
|
||||||
|
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
|
||||||
|
if result == '':
|
||||||
|
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
|
||||||
|
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
|
||||||
|
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
|
||||||
|
return result
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
return str(re.search(r'\d+', super().getRuntime(htmltree)).group()).strip(" ['']")
|
||||||
|
|
||||||
|
def getDirector(self, htmltree):
|
||||||
|
if "anime" not in self.detailurl:
|
||||||
|
return self.getFanzaString('監督:')
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
if "anime" not in self.detailurl:
|
||||||
|
return super().getActors(htmltree)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
result = self.getFanzaString('発売日:')
|
||||||
|
if result == '' or result == '----':
|
||||||
|
result = self.getFanzaString('配信開始日:')
|
||||||
|
return result.replace("/", "-").strip('\\n')
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getFanzaStrings('ジャンル:')
|
||||||
|
|
||||||
|
def getLabel(self, htmltree):
|
||||||
|
ret = self.getFanzaStrings('レーベル')
|
||||||
|
if ret == "----":
|
||||||
|
return ''
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def getSeries(self, htmltree):
|
||||||
|
ret = self.getFanzaStrings('シリーズ:')
|
||||||
|
if ret == "----":
|
||||||
|
return ''
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def getFanzaString(self, expr):
|
||||||
|
result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")
|
||||||
|
result2 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/text()")).strip(" ['']")
|
||||||
|
return result1+result2
|
||||||
|
|
||||||
|
def getFanzaStrings(self, string):
|
||||||
|
result1 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
|
||||||
|
if len(result1) > 0:
|
||||||
|
return result1
|
||||||
|
result2 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
|
||||||
|
return result2
|
||||||
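getNum above accepts the page only when the cid-style input can be mapped onto the hinban printed on the page; the three normalizations it tries are easy to see with a throwaway value (hypothetical number, illustration only):

import re

number_lo = 'test-012'
print(re.sub('-|_', '', number_lo))       # 'test012'   -> plain cid form
print(number_lo.replace('-', '00'))       # 'test00012' -> zero-padded hinban form
print(number_lo.replace('-', '') + 'so')  # 'test012so' -> the 'so'-suffixed variant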
scrapinglib/fc2.py (new file)
@@ -0,0 +1,71 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Fc2(Parser):
|
||||||
|
source = 'fc2'
|
||||||
|
imagecut = 0
|
||||||
|
|
||||||
|
expr_title = '/html/head/title/text()'
|
||||||
|
expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
|
||||||
|
expr_release = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()'
|
||||||
|
expr_runtime = "//p[@class='items_article_info']/text()"
|
||||||
|
expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
|
||||||
|
expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
|
||||||
|
expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src"
|
||||||
|
expr_tags = "//a[@class='tag tagTag']/text()"
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number.replace('FC2-', '').replace('fc2-', '')
|
||||||
|
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
htmltree = etree.HTML(self.htmlcode)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return 'FC2-' + self.number
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return super().getRelease(htmltree).strip(" ['販売日 : ']").replace('/','-')
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
actors = super().getActors(htmltree)
|
||||||
|
if not actors:
|
||||||
|
actors = '素人'
|
||||||
|
return actors
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree))
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
|
||||||
|
html = html_pather.search(self.htmlcode)
|
||||||
|
if html:
|
||||||
|
html = html.group()
|
||||||
|
extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
|
||||||
|
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||||
|
if extrafanart_imgs:
|
||||||
|
return extrafanart_imgs
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getTrailer(self, htmltree):
|
||||||
|
video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
|
||||||
|
video = video_pather.findall(self.htmlcode)
|
||||||
|
if video:
|
||||||
|
try:
|
||||||
|
video_url = video[0].replace('\'', '')
|
||||||
|
video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + self.number + '/sample?key=' + video_url
|
||||||
|
url_json = eval(self.getHtml(video_url))['path'].replace('\\', '')
|
||||||
|
return url_json
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
scrapinglib/gcolle.py (new file)
@@ -0,0 +1,73 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from .httprequest import get_html_session
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Gcolle(Parser):
|
||||||
|
source = 'gcolle'
|
||||||
|
imagecut = 4
|
||||||
|
|
||||||
|
expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href'
|
||||||
|
expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()'
|
||||||
|
expr_title = '//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'
|
||||||
|
expr_studio = '//td[contains(text(),"アップロード会員名")]/b/text()'
|
||||||
|
expr_director = '//td[contains(text(),"アップロード会員名")]/b/text()'
|
||||||
|
expr_actor = '//td[contains(text(),"アップロード会員名")]/b/text()'
|
||||||
|
expr_label = '//td[contains(text(),"アップロード会員名")]/b/text()'
|
||||||
|
expr_series = '//td[contains(text(),"アップロード会員名")]/b/text()'
|
||||||
|
expr_release = '//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'
|
||||||
|
expr_cover = '//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'
|
||||||
|
expr_tags = '//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'
|
||||||
|
expr_outline = '//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'
|
||||||
|
expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
|
||||||
|
expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number.upper().replace('GCOLLE-','')
|
||||||
|
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
|
||||||
|
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
|
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
|
||||||
|
htmltree = etree.HTML(htmlcode)
|
||||||
|
|
||||||
|
r18url = self.getTreeElement(htmltree, self.expr_r18)
|
||||||
|
if r18url and r18url.startswith('http'):
|
||||||
|
htmlcode = session.get(r18url).text
|
||||||
|
htmltree = etree.HTML(htmlcode)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
num = super().getNum(htmltree)
|
||||||
|
if self.number != num:
|
||||||
|
raise Exception(f'[!] {self.number}: find [{num}] in gcolle, not match')
|
||||||
|
return "GCOLLE-" + str(num)
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
result = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
|
try:
|
||||||
|
return "\n".join(result)
|
||||||
|
except:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return re.findall('\d{4}-\d{2}-\d{2}', super().getRelease(htmltree))[0]
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return "https:" + super().getCover(htmltree)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
|
if len(extrafanart) == 0:
|
||||||
|
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart2)
|
||||||
|
# Add "https:" in each extrafanart url
|
||||||
|
for i in range(len(extrafanart)):
|
||||||
|
extrafanart[i] = 'https:' + extrafanart[i]
|
||||||
|
return extrafanart
|
||||||
|
|
||||||
|
|
||||||
scrapinglib/getchu.py (new file)
@@ -0,0 +1,150 @@
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from urllib.parse import quote
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Getchu():
|
||||||
|
source = 'getchu'
|
||||||
|
|
||||||
|
def scrape(self, number, core: None):
|
||||||
|
dl = dlGetchu()
|
||||||
|
www = wwwGetchu()
|
||||||
|
number = number.replace("-C", "")
|
||||||
|
dic = {}
|
||||||
|
if "item" in number:
|
||||||
|
sort = ["dl.scrape(number, core)", "www.scrape(number, core)"]
|
||||||
|
else:
|
||||||
|
sort = ["www.scrape(number, core)", "dl.scrape(number, core)"]
|
||||||
|
for i in sort:
|
||||||
|
try:
|
||||||
|
dic = eval(i)
|
||||||
|
if dic != None and json.loads(dic).get('title') != '':
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return dic
|
||||||
|
|
||||||
|
class wwwGetchu(Parser):
|
||||||
|
imagecut = 0
|
||||||
|
allow_number_change = True
|
||||||
|
|
||||||
|
cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
|
||||||
|
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
|
||||||
|
|
||||||
|
expr_title = '//*[@id="soft-title"]/text()'
|
||||||
|
expr_cover = "/html/body/div[1]/table[2]/tr[1]/td/a/@href"
|
||||||
|
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
|
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
|
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
|
||||||
|
expr_label = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
|
||||||
|
expr_release = "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
|
||||||
|
expr_tags = "//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"
|
||||||
|
expr_outline = "//div[contains(text(),'商品紹介')]/following-sibling::div/text()"
|
||||||
|
expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"
|
||||||
|
expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
self.number = quote(number, encoding="euc_jp")
|
||||||
|
queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number)
|
||||||
|
# NOTE dont know why will try 2 times
|
||||||
|
retry = 2
|
||||||
|
for i in range(retry):
|
||||||
|
queryTree = self.getHtmlTree(queryUrl)
|
||||||
|
detailurl = self.getTreeElement(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
|
||||||
|
if detailurl:
|
||||||
|
break
|
||||||
|
if detailurl == "":
|
||||||
|
return None
|
||||||
|
return detailurl.replace('../', 'http://www.getchu.com/')
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return "http://www.getchu.com" + super().getCover(htmltree).replace("./", '/')
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
return super().getDirector(htmltree)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
outline = ''
|
||||||
|
_list = self.getTreeAll(htmltree, self.expr_outline)
|
||||||
|
for i in _list:
|
||||||
|
outline = outline + i.strip()
|
||||||
|
return outline
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
arts = super().getExtrafanart(htmltree)
|
||||||
|
extrafanart = []
|
||||||
|
for i in arts:
|
||||||
|
i = "http://www.getchu.com" + i.replace("./", '/')
|
||||||
|
if 'jpg' in i:
|
||||||
|
extrafanart.append(i)
|
||||||
|
return extrafanart
|
||||||
|
|
||||||
|
def extradict(self, dic: dict):
|
||||||
|
""" 额外新增的 headers
|
||||||
|
"""
|
||||||
|
dic['headers'] = {'referer': self.detailurl}
|
||||||
|
return dic
|
||||||
|
|
||||||
|
class dlGetchu(wwwGetchu):
|
||||||
|
""" 二者基本一致
|
||||||
|
headers extrafanart 略有区别
|
||||||
|
"""
|
||||||
|
|
||||||
|
imagecut = 4
|
||||||
|
allow_number_change = True
|
||||||
|
|
||||||
|
cookies = {"adult_check_flag": "1"}
|
||||||
|
extraheader = {"Referer": "https://dl.getchu.com/"}
|
||||||
|
|
||||||
|
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
|
||||||
|
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
|
||||||
|
|
||||||
|
expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
|
||||||
|
expr_cover = "//td[contains(@bgcolor,'#ffffff')]/img/@src"
|
||||||
|
expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
|
||||||
|
expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
|
||||||
|
expr_label = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
|
||||||
|
expr_runtime = "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()"
|
||||||
|
expr_release = "//td[contains(text(),'配信開始日')]/following-sibling::td/text()"
|
||||||
|
expr_tags = "//td[contains(text(),'趣向')]/following-sibling::td/a/text()"
|
||||||
|
expr_outline = "//*[contains(text(),'作品内容')]/following-sibling::td/text()"
|
||||||
|
expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href"
|
||||||
|
expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
if "item" in number or 'GETCHU' in number.upper():
|
||||||
|
self.number = re.findall('\d+',number)[0]
|
||||||
|
else:
|
||||||
|
queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
|
||||||
|
queryTree = self.getHtmlTree(queryUrl)
|
||||||
|
detailurl = self.getTreeElement(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
|
||||||
|
if detailurl == "":
|
||||||
|
return None
|
||||||
|
self.number = re.findall('\d+', detailurl)[0]
|
||||||
|
return self.GETCHU_DL_URL.replace("_WORD_", self.number)
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return 'GETCHU-' + re.findall('\d+', self.number)[0]
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return "https://dl.getchu.com" + super().getCover(htmltree)
|
||||||
|
|
||||||
|
def extradict(self, dic: dict):
|
||||||
|
return dic
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
arts = self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
|
extrafanart = []
|
||||||
|
for i in arts:
|
||||||
|
i = "https://dl.getchu.com" + i
|
||||||
|
extrafanart.append(i)
|
||||||
|
return extrafanart
|
||||||
250  scrapinglib/httprequest.py  Normal file
@@ -0,0 +1,250 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import mechanicalsoup
|
||||||
|
import requests
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
from cloudscraper import create_scraper
|
||||||
|
|
||||||
|
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
|
||||||
|
G_DEFAULT_TIMEOUT = 10
|
||||||
|
|
||||||
|
def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
|
||||||
|
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
|
"""
|
||||||
|
Core helper for HTTP GET requests
|
||||||
|
|
||||||
|
Whether to go through a proxy is decided by the caller
|
||||||
|
"""
|
||||||
|
errors = ""
|
||||||
|
headers = {"User-Agent": ua or G_USER_AGENT}
|
||||||
|
if extra_headers != None:
|
||||||
|
headers.update(extra_headers)
|
||||||
|
for i in range(retry):
|
||||||
|
try:
|
||||||
|
result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
|
||||||
|
verify=verify, cookies=cookies)
|
||||||
|
if return_type == "object":
|
||||||
|
return result
|
||||||
|
elif return_type == "content":
|
||||||
|
return result.content
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or result.apparent_encoding
|
||||||
|
return result.text
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
|
||||||
|
errors = str(e)
|
||||||
|
if "getaddrinfo failed" in errors:
|
||||||
|
print("[-]Connect Failed! Please Check your proxy config")
|
||||||
|
print("[-]" + errors)
|
||||||
|
else:
|
||||||
|
print("[-]" + errors)
|
||||||
|
print('[-]Connect Failed! Please check your Proxy or Network!')
|
||||||
|
raise Exception('Connect Failed')
|
||||||
|
|
||||||
|
|
||||||
|
def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
|
||||||
|
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
|
"""
|
||||||
|
Whether to go through a proxy is decided by the caller
|
||||||
|
"""
|
||||||
|
errors = ""
|
||||||
|
headers = {"User-Agent": ua or G_USER_AGENT}
|
||||||
|
|
||||||
|
for i in range(retry):
|
||||||
|
try:
|
||||||
|
result = requests.post(url, data=data, files=files, headers=headers, timeout=timeout, proxies=proxies,
|
||||||
|
verify=verify, cookies=cookies)
|
||||||
|
if return_type == "object":
|
||||||
|
return result
|
||||||
|
elif return_type == "content":
|
||||||
|
return result.content
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or result.apparent_encoding
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
|
||||||
|
errors = str(e)
|
||||||
|
if "getaddrinfo failed" in errors:
|
||||||
|
print("[-]Connect Failed! Please Check your proxy config")
|
||||||
|
print("[-]" + errors)
|
||||||
|
else:
|
||||||
|
print("[-]" + errors)
|
||||||
|
print('[-]Connect Failed! Please check your Proxy or Network!')
|
||||||
|
raise Exception('Connect Failed')
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# TODO: the helpers below are temporary; revisit after every site parser has been migrated
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
class TimeoutHTTPAdapter(HTTPAdapter):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self.timeout = G_DEFAULT_TIMEOUT
|
||||||
|
if "timeout" in kwargs:
|
||||||
|
self.timeout = kwargs["timeout"]
|
||||||
|
del kwargs["timeout"]
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
def send(self, request, **kwargs):
|
||||||
|
timeout = kwargs.get("timeout")
|
||||||
|
if timeout is None:
|
||||||
|
kwargs["timeout"] = self.timeout
|
||||||
|
return super().send(request, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# with keep-alive feature
|
||||||
|
# storyline carib gcolle javdb only
|
||||||
|
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
|
||||||
|
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
|
session = requests.Session()
|
||||||
|
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
||||||
|
status_forcelist=[429, 500, 502, 503, 504])
|
||||||
|
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
|
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
|
||||||
|
if verify:
|
||||||
|
session.verify = verify
|
||||||
|
if proxies:
|
||||||
|
session.proxies = proxies
|
||||||
|
session.headers = {"User-Agent": ua or G_USER_AGENT}
|
||||||
|
try:
|
||||||
|
if isinstance(url, str) and len(url):
|
||||||
|
result = session.get(str(url))
|
||||||
|
else: # an empty url returns the reusable session object directly; no return_type needed
|
||||||
|
return session
|
||||||
|
if not result.ok:
|
||||||
|
return None
|
||||||
|
if return_type == "object":
|
||||||
|
return result
|
||||||
|
elif return_type == "content":
|
||||||
|
return result.content
|
||||||
|
elif return_type == "session":
|
||||||
|
return result, session
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or "utf-8"
|
||||||
|
return result.text
|
||||||
|
except requests.exceptions.ProxyError:
|
||||||
|
print("[-]get_html_session() Proxy error! Please check your Proxy")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[-]get_html_session() failed. {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# storyline only
|
||||||
|
# optionally uses cloudscraper
|
||||||
|
def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
|
||||||
|
encoding: str = None, use_scraper: bool = False,
|
||||||
|
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
|
session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
|
||||||
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
|
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
|
||||||
|
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
||||||
|
status_forcelist=[429, 500, 502, 503, 504])
|
||||||
|
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
if verify:
|
||||||
|
session.verify = verify
|
||||||
|
if proxies:
|
||||||
|
session.proxies = proxies
|
||||||
|
try:
|
||||||
|
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
|
||||||
|
if isinstance(url, str) and len(url):
|
||||||
|
result = browser.open(url)
|
||||||
|
else:
|
||||||
|
return browser
|
||||||
|
if not result.ok:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if return_type == "object":
|
||||||
|
return result
|
||||||
|
elif return_type == "content":
|
||||||
|
return result.content
|
||||||
|
elif return_type == "browser":
|
||||||
|
return result, browser
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or "utf-8"
|
||||||
|
return result.text
|
||||||
|
except requests.exceptions.ProxyError:
|
||||||
|
print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
|
||||||
|
except Exception as e:
|
||||||
|
print(f'[-]get_html_by_browser() Failed! {e}')
|
||||||
|
return None
|
||||||
|
|
||||||
|
# storyline xcity only
|
||||||
|
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
|
||||||
|
return_type: str = None, encoding: str = None,
|
||||||
|
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
|
||||||
|
session = requests.Session()
|
||||||
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
|
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
|
||||||
|
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
||||||
|
status_forcelist=[429, 500, 502, 503, 504])
|
||||||
|
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
if verify:
|
||||||
|
session.verify = verify
|
||||||
|
if proxies:
|
||||||
|
session.proxies = proxies
|
||||||
|
try:
|
||||||
|
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
|
||||||
|
result = browser.open(url)
|
||||||
|
if not result.ok:
|
||||||
|
return None
|
||||||
|
form = browser.select_form() if form_select is None else browser.select_form(form_select)
|
||||||
|
if isinstance(fields, dict):
|
||||||
|
for k, v in fields.items():
|
||||||
|
browser[k] = v
|
||||||
|
response = browser.submit_selected()
|
||||||
|
|
||||||
|
if return_type == "object":
|
||||||
|
return response
|
||||||
|
elif return_type == "content":
|
||||||
|
return response.content
|
||||||
|
elif return_type == "browser":
|
||||||
|
return response, browser
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or "utf-8"
|
||||||
|
return response.text
|
||||||
|
except requests.exceptions.ProxyError:
|
||||||
|
print("[-]get_html_by_form() Proxy error! Please check your Proxy")
|
||||||
|
except Exception as e:
|
||||||
|
print(f'[-]get_html_by_form() Failed! {e}')
|
||||||
|
return None
|
||||||
|
|
||||||
|
# storyline javdb only
|
||||||
|
def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
|
||||||
|
encoding: str = None, retry: int = 3, proxies=None, timeout: int = G_DEFAULT_TIMEOUT, verify=None):
|
||||||
|
session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
|
||||||
|
if isinstance(cookies, dict) and len(cookies):
|
||||||
|
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
|
||||||
|
retries = Retry(total=retry, connect=retry, backoff_factor=1,
|
||||||
|
status_forcelist=[429, 500, 502, 503, 504])
|
||||||
|
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
|
||||||
|
if verify:
|
||||||
|
session.verify = verify
|
||||||
|
if proxies:
|
||||||
|
session.proxies = proxies
|
||||||
|
try:
|
||||||
|
if isinstance(url, str) and len(url):
|
||||||
|
result = session.get(str(url))
|
||||||
|
else: # an empty url returns the reusable scraper object directly; no return_type needed
|
||||||
|
return session
|
||||||
|
if not result.ok:
|
||||||
|
return None
|
||||||
|
if return_type == "object":
|
||||||
|
return result
|
||||||
|
elif return_type == "content":
|
||||||
|
return result.content
|
||||||
|
elif return_type == "scraper":
|
||||||
|
return result, session
|
||||||
|
else:
|
||||||
|
result.encoding = encoding or "utf-8"
|
||||||
|
return result.text
|
||||||
|
except requests.exceptions.ProxyError:
|
||||||
|
print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[-]get_html_by_scraper() failed. {e}")
|
||||||
|
return None
|
||||||
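A minimal usage sketch of the request helpers defined above; the target url and the cookie value are placeholders, not anything this module mandates.

from scrapinglib import httprequest

# one-shot GET: returns the decoded page text by default
text = httprequest.get('https://www.example.com/', timeout=10)

# keep-alive session with the retry/timeout adapters mounted; calling it
# without a url returns the reusable session object itself
session = httprequest.get_html_session(cookies={'adc': '1'})
resp = session.get('https://www.example.com/')
print(resp.ok, len(text))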
83  scrapinglib/jav321.py  Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from . import httprequest
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Jav321(Parser):
|
||||||
|
source = 'jav321'
|
||||||
|
|
||||||
|
expr_title = "/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()"
|
||||||
|
expr_cover = "/html/body/div[2]/div[2]/div[1]/p/a/img/@src"
|
||||||
|
expr_outline = "/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()"
|
||||||
|
# NOTE: use xpath consistently for all fields
|
||||||
|
expr_number = '//b[contains(text(),"品番")]/following-sibling::node()'
|
||||||
|
expr_actor = '//b[contains(text(),"出演者")]/following-sibling::a[starts-with(@href,"/star")]'
|
||||||
|
expr_label = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
|
||||||
|
expr_tags = '//b[contains(text(),"ジャンル")]/following-sibling::a[starts-with(@href,"/genre")]'
|
||||||
|
expr_studio = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
|
||||||
|
expr_release = '//b[contains(text(),"配信開始日")]/following-sibling::node()'
|
||||||
|
expr_runtime = '//b[contains(text(),"収録時間")]/following-sibling::node()'
|
||||||
|
# expr_series = '//b[contains(text(),"シリーズ")]'
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
return 'https://www.jav321.com/search'
|
||||||
|
|
||||||
|
def getHtmlTree(self, url):
|
||||||
|
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
|
if "/video/" in resp.url:
|
||||||
|
self.detailurl = resp.url
|
||||||
|
self.detailhtml = resp.text
|
||||||
|
return etree.fromstring(resp.text, etree.HTMLParser())
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return super().getNum(htmltree).split(": ")[1]
|
||||||
|
|
||||||
|
def getTrailer(self, htmltree):
|
||||||
|
videourl_pather = re.compile(r'<source src=\"(.*?)\"')
|
||||||
|
videourl = videourl_pather.findall(self.detailhtml)
|
||||||
|
if videourl:
|
||||||
|
url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
|
||||||
|
return url
|
||||||
|
else:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
|
||||||
|
html = html_pather.search(self.detailhtml)
|
||||||
|
if html:
|
||||||
|
html = html.group()
|
||||||
|
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
|
||||||
|
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||||
|
if extrafanart_imgs:
|
||||||
|
return extrafanart_imgs
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return super().getRelease(htmltree).split(": ")[1]
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
return super().getRuntime(htmltree).split(": ")[1]
|
||||||
|
|
||||||
|
def parseElement(self, all):
|
||||||
|
if all:
|
||||||
|
ret = []
|
||||||
|
for si in all:
|
||||||
|
ret.append(si.text)
|
||||||
|
return ",".join(ret)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
return self.parseElement(super().getActors(htmltree))
|
||||||
|
|
||||||
|
def getLabel(self, htmltree):
|
||||||
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_label))
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_tags))
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
return self.parseElement(self.getTreeAll(htmltree, self.expr_studio))
|
||||||
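For reference, the search flow that Jav321.getHtmlTree relies on, sketched with requests directly: the number is POSTed as `sn` to /search and a hit redirects to the /video/ detail page. The number below is a placeholder.

import requests

resp = requests.post('https://www.jav321.com/search', data={'sn': 'abc-123'}, timeout=10)
if '/video/' in resp.url:      # a hit redirects to the detail page
    detail_html = resp.text
else:                          # no match found
    detail_html = None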
145  scrapinglib/javbus.py  Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import secrets
|
||||||
|
import inspect
|
||||||
|
from lxml import etree
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
class Javbus(Parser):
|
||||||
|
|
||||||
|
source = 'javbus'
|
||||||
|
|
||||||
|
expr_number = '/html/head/meta[@name="keywords"]/@content'
|
||||||
|
expr_title = '/html/head/title/text()'
|
||||||
|
expr_studio = '//span[contains(text(),"製作商:")]/../a/text()'
|
||||||
|
expr_studio2 = '//span[contains(text(),"メーカー:")]/../a/text()'
|
||||||
|
expr_director = '//span[contains(text(),"導演:")]/../a/text()'
|
||||||
|
expr_directorJa = '//span[contains(text(),"監督:")]/../a/text()'
|
||||||
|
expr_series = '//span[contains(text(),"系列:")]/../a/text()'
|
||||||
|
expr_series2 = '//span[contains(text(),"シリーズ:")]/../a/text()'
|
||||||
|
expr_label = '//span[contains(text(),"系列:")]/../a/text()'
|
||||||
|
expr_cover = '//a[@class="bigImage"]/@href'
|
||||||
|
expr_release = '/html/body/div[5]/div[1]/div[2]/p[2]/text()'
|
||||||
|
expr_runtime = '/html/body/div[5]/div[1]/div[2]/p[3]/text()'
|
||||||
|
expr_actor = '//div[@class="star-name"]/a'
|
||||||
|
expr_actorphoto = '//div[@class="star-name"]/../a/img'
|
||||||
|
expr_tags = '/html/head/meta[@name="keywords"]/@content'
|
||||||
|
expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number
|
||||||
|
try:
|
||||||
|
url = "https://www." + secrets.choice([
|
||||||
|
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
|
||||||
|
'cdnbus.fun',
|
||||||
|
'dmmbus.fun', 'dmmsee.fun',
|
||||||
|
'fanbus.us',
|
||||||
|
'seedmm.fun',
|
||||||
|
]) + "/"
|
||||||
|
try:
|
||||||
|
self.detailurl = url + number
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
except:
|
||||||
|
self.detailurl = 'https://www.javbus.com/' + number
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
htmltree = etree.fromstring(self.htmlcode,etree.HTMLParser())
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
except:
|
||||||
|
return self.searchUncensored(number)  # return the fallback result instead of dropping it
|
||||||
|
|
||||||
|
def searchUncensored(self, number):
|
||||||
|
""" 二次搜索无码
|
||||||
|
"""
|
||||||
|
self.imagecut = 0
|
||||||
|
self.uncensored = True
|
||||||
|
|
||||||
|
w_number = number.replace('.', '-')
|
||||||
|
self.detailurl = 'https://www.javbus.red/' + w_number
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
return super().getNum(htmltree).split(',')[0]
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
title = super().getTitle(htmltree)
|
||||||
|
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
|
||||||
|
return title
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
if self.uncensored:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_studio2)
|
||||||
|
else:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_studio)
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
return urljoin("https://www.javbus.com", super().getCover(htmltree))
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return super().getRelease(htmltree).strip(" ['']")
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
return super().getRuntime(htmltree).strip(" ['']分鐘")
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
actors = super().getActors(htmltree)
|
||||||
|
b=[]
|
||||||
|
for i in actors:
|
||||||
|
b.append(i.attrib['title'])
|
||||||
|
return b
|
||||||
|
|
||||||
|
def getActorPhoto(self, htmltree):
|
||||||
|
actors = super().getActorPhoto(htmltree)
|
||||||
|
d = {}
|
||||||
|
for i in actors:
|
||||||
|
p = i.attrib['src']
|
||||||
|
if "nowprinting.gif" in p:
|
||||||
|
continue
|
||||||
|
t = i.attrib['title']
|
||||||
|
d[t] = urljoin("https://www.javbus.com", p)
|
||||||
|
return d
|
||||||
|
|
||||||
|
def getDirector(self, htmltree):
|
||||||
|
if self.uncensored:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_directorJa)
|
||||||
|
else:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_director)
|
||||||
|
|
||||||
|
def getSeries(self, htmltree):
|
||||||
|
if self.uncensored:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_series2)
|
||||||
|
else:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_series)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
tags = super().getTags(htmltree).split(',')
|
||||||
|
return tags[1:]
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
|
||||||
|
html = html_pather.search(self.htmlcode)
|
||||||
|
if html:
|
||||||
|
html = html.group()
|
||||||
|
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
|
||||||
|
extrafanart_imgs = extrafanart_pather.findall(html)
|
||||||
|
if extrafanart_imgs:
|
||||||
|
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
if self.morestoryline:
|
||||||
|
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
|
||||||
|
return '' # calls coming from airav.py skip the outline and return immediately, to avoid fetching the same data twice and slowing things down
|
||||||
|
from .storyline import getStoryline
|
||||||
|
return getStoryline(self.number , uncensored = self.uncensored)
|
||||||
|
return ''
|
||||||
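A hypothetical standalone call to the Javbus parser; in the scraper it is normally driven through Parser.scrape(number, core). The number is a placeholder, and search() returns the JSON string built by Parser.dictformat (or 404).

import json
from scrapinglib.javbus import Javbus

raw = Javbus().search('ABC-123')      # placeholder number
if raw and raw != 404:
    data = json.loads(raw)
    print(data.get('title'), data.get('actor'))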
260  scrapinglib/javdb.py  Normal file
@@ -0,0 +1,260 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
from lxml import etree
|
||||||
|
from .httprequest import get_html_session
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Javdb(Parser):
|
||||||
|
source = 'javdb'
|
||||||
|
|
||||||
|
fixstudio = False
|
||||||
|
noauth = False
|
||||||
|
|
||||||
|
expr_number = '//strong[contains(text(),"番號")]/../span/text()'
|
||||||
|
expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
|
||||||
|
expr_title = "/html/head/title/text()"
|
||||||
|
expr_title_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/text()'
|
||||||
|
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
|
||||||
|
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
|
||||||
|
expr_uncensored = '//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?") or contains(@href,"/tags/western?")]'
|
||||||
|
expr_actor = '//span[@class="value"]/a[contains(@href,"/actors/")]/text()'
|
||||||
|
expr_actor2 = '//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class'
|
||||||
|
expr_release = '//strong[contains(text(),"日期")]/../span/text()'
|
||||||
|
expr_release_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "meta")]/text()'
|
||||||
|
expr_studio = '//strong[contains(text(),"片商")]/../span/a/text()'
|
||||||
|
expr_studio2 = '//strong[contains(text(),"賣家:")]/../span/a/text()'
|
||||||
|
expr_director = '//strong[contains(text(),"導演")]/../span/text()'
|
||||||
|
expr_director2 = '//strong[contains(text(),"導演")]/../span/a/text()'
|
||||||
|
expr_cover = "//div[contains(@class, 'column-video-cover')]/a/img/@src"
|
||||||
|
expr_cover2 = "//div[contains(@class, 'column-video-cover')]/img/@src"
|
||||||
|
expr_cover_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "cover")]/img/@src'
|
||||||
|
expr_extrafanart = "//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href"
|
||||||
|
expr_tags = '//strong[contains(text(),"類別")]/../span/a/text()'
|
||||||
|
expr_tags2 = '//strong[contains(text(),"類別")]/../span/text()'
|
||||||
|
expr_series = '//strong[contains(text(),"系列")]/../span/text()'
|
||||||
|
expr_series2 = '//strong[contains(text(),"系列")]/../span/a/text()'
|
||||||
|
expr_label = '//strong[contains(text(),"系列")]/../span/text()'
|
||||||
|
expr_label2 = '//strong[contains(text(),"系列")]/../span/a/text()'
|
||||||
|
expr_userrating = '//span[@class="score-stars"]/../text()'
|
||||||
|
expr_uservotes = '//span[@class="score-stars"]/../text()'
|
||||||
|
expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'
|
||||||
|
|
||||||
|
def updateCore(self, core):
|
||||||
|
if core.proxies:
|
||||||
|
self.proxies = core.proxies
|
||||||
|
if core.verify:
|
||||||
|
self.verify = core.verify
|
||||||
|
if core.morestoryline:
|
||||||
|
self.morestoryline = True
|
||||||
|
# special
|
||||||
|
if core.dbcookies:
|
||||||
|
self.cookies = core.dbcookies
|
||||||
|
else:
|
||||||
|
self.cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
|
||||||
|
if core.dbsite:
|
||||||
|
self.dbsite = core.dbsite
|
||||||
|
else:
|
||||||
|
self.dbsite = 'javdb'
|
||||||
|
|
||||||
|
def search(self, number: str):
|
||||||
|
self.number = number
|
||||||
|
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
|
||||||
|
self.detailurl = self.queryNumberUrl(number)
|
||||||
|
self.deatilpage = self.session.get(self.detailurl).text
|
||||||
|
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
|
||||||
|
self.noauth = True
|
||||||
|
self.imagecut = 0
|
||||||
|
result = self.dictformat(self.querytree)
|
||||||
|
else:
|
||||||
|
htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
javdb_url = 'https://' + self.dbsite + '.com/search?q=' + number + '&f=all'
|
||||||
|
try:
|
||||||
|
resp = self.session.get(javdb_url)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
raise Exception(f'[!] {self.number}: page not found in javdb')
|
||||||
|
|
||||||
|
self.querytree = etree.fromstring(resp.text, etree.HTMLParser())
|
||||||
|
# javdb sometimes returns multiple results,
# and the first element may not be the one we are looking for;
# iterate over all candidates and pick the matching one
|
||||||
|
urls = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
|
||||||
|
# note the Western-release ids here, e.g. ['Blacked','Blacked']
|
||||||
|
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
|
||||||
|
correct_url = urls[0]
|
||||||
|
else:
|
||||||
|
ids = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
|
||||||
|
try:
|
||||||
|
self.queryid = ids.index(number)
|
||||||
|
correct_url = urls[self.queryid]
|
||||||
|
except:
|
||||||
|
# to avoid picking up a wrong number, only accept an exact match
|
||||||
|
if ids[0].upper() != number:
|
||||||
|
raise ValueError("number not found in javdb")
|
||||||
|
correct_url = urls[0]
|
||||||
|
return urljoin(resp.url, correct_url)
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
if self.noauth:
|
||||||
|
return self.number
|
||||||
|
result1 = str(self.getTreeAll(htmltree, self.expr_number)).strip(" ['']")
|
||||||
|
result2 = str(self.getTreeAll(htmltree, self.expr_number2)).strip(" ['']")
|
||||||
|
dp_number = str(result2 + result1).strip('+')
|
||||||
|
# NOTE: verify the match and update self.number
|
||||||
|
if dp_number.upper() != self.number.upper():
|
||||||
|
raise Exception(f'[!] {self.number}: find [{dp_number}] in javdb, not match')
|
||||||
|
self.number = dp_number
|
||||||
|
return self.number
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
if self.noauth:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_title_no, self.queryid)
|
||||||
|
browser_title = super().getTitle(htmltree)
|
||||||
|
title = browser_title[:browser_title.find(' | JavDB')].strip()
|
||||||
|
return title.replace(self.number, '').strip()
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
if self.noauth:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_cover_no, self.queryid)
|
||||||
|
return super().getCover(htmltree)
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
if self.noauth:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_release_no, self.queryid).strip()
|
||||||
|
return super().getRelease(htmltree)
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
result1 = str(self.getTreeAll(htmltree, self.expr_runtime)).strip(" ['']")
|
||||||
|
result2 = str(self.getTreeAll(htmltree, self.expr_runtime2)).strip(" ['']")
|
||||||
|
return str(result1 + result2).strip('+').rstrip('mi')
|
||||||
|
|
||||||
|
def getDirector(self, htmltree):
|
||||||
|
result1 = str(self.getTreeAll(htmltree, self.expr_director)).strip(" ['']")
|
||||||
|
result2 = str(self.getTreeAll(htmltree, self.expr_director2)).strip(" ['']")
|
||||||
|
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
|
|
||||||
|
def getSeries(self, htmltree):
|
||||||
|
result1 = str(self.getTreeAll(htmltree, self.expr_series)).strip(" ['']")
|
||||||
|
result2 = str(self.getTreeAll(htmltree, self.expr_series2)).strip(" ['']")
|
||||||
|
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
|
if not result and self.fixstudio:
|
||||||
|
result = self.getStudio(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getLabel(self, htmltree):
|
||||||
|
result1 = str(self.getTreeAll(htmltree, self.expr_label)).strip(" ['']")
|
||||||
|
result2 = str(self.getTreeAll(htmltree, self.expr_label2)).strip(" ['']")
|
||||||
|
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
|
||||||
|
if not result and self.fixstudio:
|
||||||
|
result = self.getStudio(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
actors = self.getTreeAll(htmltree, self.expr_actor)
|
||||||
|
genders = self.getTreeAll(htmltree, self.expr_actor2)
|
||||||
|
r = []
|
||||||
|
idx = 0
|
||||||
|
# NOTE: only female performers; we don't care about the others
|
||||||
|
actor_gendor = 'female'
|
||||||
|
for act in actors:
|
||||||
|
if((actor_gendor == 'all')
|
||||||
|
or (actor_gendor == 'both' and genders[idx] in ['symbol female', 'symbol male'])
|
||||||
|
or (actor_gendor == 'female' and genders[idx] == 'symbol female')
|
||||||
|
or (actor_gendor == 'male' and genders[idx] == 'symbol male')):
|
||||||
|
r.append(act)
|
||||||
|
idx = idx + 1
|
||||||
|
if re.match(r'FC2-[\d]+', self.number, re.A) and not r:
|
||||||
|
r = '素人'
|
||||||
|
self.fixstudio = True
|
||||||
|
return r
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
if self.morestoryline:
|
||||||
|
from .storyline import getStoryline
|
||||||
|
return getStoryline(self.number, self.getUncensored(htmltree))
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeAll(htmltree, self.expr_studio).strip(" ['']")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeAll(htmltree, self.expr_studio2).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getTrailer(self, htmltree):
|
||||||
|
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
|
||||||
|
video = video_pather.findall(self.deatilpage)
|
||||||
|
# also guard against an empty result list
|
||||||
|
if video and video[0] != "":
|
||||||
|
if not 'https:' in video[0]:
|
||||||
|
video_url = 'https:' + video[0]
|
||||||
|
else:
|
||||||
|
video_url = video[0]
|
||||||
|
else:
|
||||||
|
video_url = ''
|
||||||
|
return video_url
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags2)
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getUserRating(self, htmltree):
|
||||||
|
try:
|
||||||
|
result = str(self.getTreeElement(htmltree, self.expr_userrating))
|
||||||
|
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
||||||
|
return float(v[0][0])
|
||||||
|
except:
|
||||||
|
return
|
||||||
|
|
||||||
|
def getUserVotes(self, htmltree):
|
||||||
|
try:
|
||||||
|
result = str(self.getTreeElement(htmltree, self.expr_uservotes))
|
||||||
|
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
|
||||||
|
return int(v[0][1])
|
||||||
|
except:
|
||||||
|
return
|
||||||
|
|
||||||
|
def getaphoto(self, url, session):
|
||||||
|
html_page = session.get(url).text
|
||||||
|
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
|
||||||
|
return img_url[0] if img_url else ''
|
||||||
|
|
||||||
|
def getActorPhoto(self, htmltree):
|
||||||
|
actorall = self.getTreeAll(htmltree, self.expr_actorphoto)
|
||||||
|
if not actorall:
|
||||||
|
return {}
|
||||||
|
actors = self.getActors(htmltree)
|
||||||
|
actor_photo = {}
|
||||||
|
for i in actorall:
|
||||||
|
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
|
||||||
|
if not len(x) or not len(x[0]) or i.text not in actors:
|
||||||
|
continue
|
||||||
|
# NOTE: https://c1.jdbstatic.com changes frequently, so use the address found on the page instead
|
||||||
|
# actor_id = x[0]
|
||||||
|
# pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
|
||||||
|
# if not self.session.head(pic_url).ok:
|
||||||
|
try:
|
||||||
|
pic_url = self.getaphoto(urljoin('https://javdb.com', i.attrib['href']), self.session)
|
||||||
|
if len(pic_url):
|
||||||
|
actor_photo[i.text] = pic_url
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return actor_photo
|
||||||
|
|
||||||
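For context, a minimal stand-in for the `core` object that Javdb.updateCore expects; the attribute names mirror the code above, the values are examples only.

from types import SimpleNamespace
from scrapinglib.javdb import Javdb

core = SimpleNamespace(
    proxies=None,                 # e.g. {'https': 'http://127.0.0.1:7890'}
    verify=None,
    morestoryline=False,
    dbcookies={'over18': '1', 'theme': 'auto', 'locale': 'zh'},
    dbsite='javdb',               # subdomain used to build the search url
)
result = Javdb().scrape('ABC-123', core)   # placeholder number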
63  scrapinglib/madou.py  Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from urllib.parse import urlparse, unquote
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Madou(Parser):
|
||||||
|
source = 'madou'
|
||||||
|
uncensored = True
|
||||||
|
|
||||||
|
expr_url = '//a[@class="share-weixin"]/@data-url'
|
||||||
|
expr_title = "/html/head/title/text()"
|
||||||
|
expr_studio = '//a[@rel="category tag"]/text()'
|
||||||
|
expr_tags = '/html/head/meta[@name="keywords"]/@content'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number.lower().strip()
|
||||||
|
self.detailurl = "https://madou.club/" + number + ".html"
|
||||||
|
self.htmlcode = self.getHtml(self.detailurl)
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||||
|
self.detailurl = self.getTreeElement(htmltree, self.expr_url)
|
||||||
|
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
try:
|
||||||
|
# decode the url
|
||||||
|
filename = unquote(urlparse(self.detailurl).path)
|
||||||
|
# trim the file name
|
||||||
|
result = filename[1:-5].upper().strip()
|
||||||
|
# strip the Chinese part
|
||||||
|
if result.upper() != self.number.upper():
|
||||||
|
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
|
||||||
|
# strip redundant symbols
|
||||||
|
return result.strip('-')
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
|
||||||
|
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
|
||||||
|
# <title>MD0094/贫嘴贱舌中出大嫂/坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
|
||||||
|
# <title>TM0002-我的痴女女友-麻豆社</title>
|
||||||
|
browser_title = str(super().getTitle(htmltree))
|
||||||
|
title = str(re.findall(r'^[A-Z0-9 //\-]*(.*)-麻豆社$', browser_title)[0]).strip()
|
||||||
|
return title
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
try:
|
||||||
|
url = str(re.findall("shareimage : '(.*?)'", self.htmlcode)[0])
|
||||||
|
return url.strip()
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
studio = self.getStudio(htmltree)
|
||||||
|
x = super().getTags(htmltree).split(',')
|
||||||
|
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
|
||||||
52  scrapinglib/mgstage.py  Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from .utils import getTreeElement
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Mgstage(Parser):
|
||||||
|
source = 'mgstage'
|
||||||
|
|
||||||
|
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
|
||||||
|
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
|
||||||
|
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
|
||||||
|
expr_outline = '//dl[@id="introduction"]/dd/p/text()'
|
||||||
|
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
|
||||||
|
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||||
|
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
|
||||||
|
expr_release = '//th[contains(text(),"配信開始日:")]/../td/a/text()'
|
||||||
|
expr_cover = '//*[@id="EnlargeImage"]/@href'
|
||||||
|
expr_label = '//th[contains(text(),"シリーズ:")]/../td/a/text()'
|
||||||
|
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
|
||||||
|
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
|
||||||
|
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
|
||||||
|
expr_extrafanart = '//a[@class="sample_image"]/@href'
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number.upper()
|
||||||
|
self.cookies = {'adc':'1'}
|
||||||
|
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
|
||||||
|
htmltree =self.getHtmlTree(self.detailurl)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
return super().getTitle(htmltree).replace('/', ',').strip()
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
results = self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
results2 = self.getTreeAll(htmltree, self.expr_tags2)
|
||||||
|
return [ x.strip() for x in (results + results2) if x.strip()]
|
||||||
|
|
||||||
|
def getTreeAll(self, tree, expr):
|
||||||
|
alls = super().getTreeAll(tree, expr)
|
||||||
|
return [ x.strip() for x in alls ]
|
||||||
|
|
||||||
|
def getTreeElement(self, tree, expr, index=0):
|
||||||
|
if expr == '':
|
||||||
|
return ''
|
||||||
|
result1 = getTreeElement(tree, expr).strip().replace("', '", '').strip(" ['']")
|
||||||
|
result2 = getTreeElement(tree, expr.replace('td/a/','td/')).strip().replace("', '", '').strip(" ['']")
|
||||||
|
if result1 == result2:
|
||||||
|
return str(result1).strip('+').replace("', '",'').replace('"','')
|
||||||
|
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
|
||||||
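A small illustration of what the getTreeElement override above is handling: mgstage detail rows sometimes wrap the value in a link and sometimes leave it as plain text, so both variants of the expression are evaluated and merged. The HTML snippet is made up.

from lxml import etree

snippet = '<table><tr><th>収録時間:</th><td>120min</td></tr></table>'
tree = etree.fromstring(snippet, etree.HTMLParser())
with_link = tree.xpath('//th[contains(text(),"収録時間:")]/../td/a/text()')    # []
plain_text = tree.xpath('//th[contains(text(),"収録時間:")]/../td/text()')     # ['120min']
print((with_link or plain_text)[0])    # 120min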
93  scrapinglib/mv91.py  Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
|
||||||
|
import re
|
||||||
|
from lxml import etree
|
||||||
|
from .parser import Parser
|
||||||
|
|
||||||
|
|
||||||
|
class Mv91(Parser):
|
||||||
|
source = 'mv91'
|
||||||
|
|
||||||
|
expr_number = '//div[@class="player-title"]/text()'
|
||||||
|
expr_title = '//div[@class="player-title"]/text()'
|
||||||
|
expr_release = '//p[@class="date"]/text()'
|
||||||
|
expr_outline = '//div[@class="play-text"]/text()'
|
||||||
|
expr_tags = '//div[@class="player-tag"]/text()'
|
||||||
|
expr_actor = '//p[@class="player-name"]/text()'
|
||||||
|
|
||||||
|
def getHtmlTree(self, url, type=None):
|
||||||
|
self.htmlcode = self.getHtml(url, type)
|
||||||
|
if self.htmlcode == 404:
|
||||||
|
return 404
|
||||||
|
ret = etree.fromstring(self.htmlcode, etree.HTMLParser())
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
keyword = number.replace('91CM-','').replace('91MS-','')
|
||||||
|
search_html = self.getHtml('https://www.91mv.org/index/search?keywords=' + keyword)
|
||||||
|
html = etree.fromstring(search_html, etree.HTMLParser())
|
||||||
|
endurl = html.xpath('//a[@class="video-list"]/@href')[0]
|
||||||
|
return 'https://www.91mv.org' + endurl
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
try:
|
||||||
|
num = super().getNum(htmltree)
|
||||||
|
finds = re.findall('(.*)(91.*-\d*)',num)
|
||||||
|
if finds:
|
||||||
|
result = str(finds[0][1])
|
||||||
|
else:
|
||||||
|
result = ' '.join(num.replace('/',' ').split())
|
||||||
|
result = result.split()[1]
|
||||||
|
if self.number.upper() != result.upper():
|
||||||
|
raise Exception(f'[!] {self.number}: find {result} in mv91, not match')
|
||||||
|
return result.strip()
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
try:
|
||||||
|
title = super().getTitle(htmltree)
|
||||||
|
finds = re.findall('(.*)(91.*-\d*)',title)
|
||||||
|
if finds:
|
||||||
|
result = str(finds[0][0])
|
||||||
|
else:
|
||||||
|
result = ' '.join(title.replace('/',' ').split())
|
||||||
|
result = result.split()[0].replace('「预告」','')
|
||||||
|
return result.strip()
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
return '91制片厂'
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
b=[]
|
||||||
|
for player in self.getTreeAll(htmltree, self.expr_actor):
|
||||||
|
player = player.replace('主演:','')
|
||||||
|
if '/' in player:
|
||||||
|
player = player.split('/')[0]
|
||||||
|
player = re.sub(r'[0-9]+', '', player)
|
||||||
|
b.append(player)
|
||||||
|
return b
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
try:
|
||||||
|
result = super().getRelease(htmltree)
|
||||||
|
date = result.replace('日期:','')
|
||||||
|
if isinstance(date, str) and len(date):
|
||||||
|
return date
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
try:
|
||||||
|
url = str(re.findall('var pic_url = "(.*?)"', self.htmlcode)[0])
|
||||||
|
return url.strip()
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
265  scrapinglib/parser.py  Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from lxml import etree, html
|
||||||
|
|
||||||
|
from . import httprequest
|
||||||
|
from .utils import getTreeElement, getTreeAll
|
||||||
|
|
||||||
|
class Parser:
|
||||||
|
|
||||||
|
source = 'base'
|
||||||
|
# poster: `0` copy, `1` crop
|
||||||
|
imagecut = 1
|
||||||
|
uncensored = False
|
||||||
|
allow_number_change = False
|
||||||
|
# update
|
||||||
|
proxies = None
|
||||||
|
verify = None
|
||||||
|
extraheader = None
|
||||||
|
cookies = None
|
||||||
|
morestoryline = False
|
||||||
|
|
||||||
|
number = ''
|
||||||
|
detailurl = ''
|
||||||
|
# xpath expr
|
||||||
|
expr_number = ''
|
||||||
|
expr_title = ''
|
||||||
|
expr_studio = ''
|
||||||
|
expr_studio2 = ''
|
||||||
|
expr_runtime = ''
|
||||||
|
expr_runtime2 = ''
|
||||||
|
expr_release = ''
|
||||||
|
expr_outline = ''
|
||||||
|
expr_director = ''
|
||||||
|
expr_actor = ''
|
||||||
|
expr_tags = ''
|
||||||
|
expr_label = ''
|
||||||
|
expr_label2 = ''
|
||||||
|
expr_series = ''
|
||||||
|
expr_series2 = ''
|
||||||
|
expr_cover = ''
|
||||||
|
expr_cover2 = ''
|
||||||
|
expr_smallcover = ''
|
||||||
|
expr_extrafanart = ''
|
||||||
|
expr_trailer = ''
|
||||||
|
expr_actorphoto = ''
|
||||||
|
expr_uncensored = ''
|
||||||
|
expr_userrating = ''
|
||||||
|
expr_uservotes = ''
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def scrape(self, number, core: None):
|
||||||
|
""" 刮削番号
|
||||||
|
"""
|
||||||
|
self.updateCore(core)
|
||||||
|
result = self.search(number)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def search(self, number):
|
||||||
|
self.number = number
|
||||||
|
self.detailurl = self.queryNumberUrl(number)
|
||||||
|
htmltree = self.getHtmlTree(self.detailurl)
|
||||||
|
result = self.dictformat(htmltree)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def updateCore(self, core):
|
||||||
|
""" 从`core`内更新参数
|
||||||
|
|
||||||
|
针对需要传递的参数: cookies, proxy等
|
||||||
|
子类继承后修改
|
||||||
|
"""
|
||||||
|
if core.proxies:
|
||||||
|
self.proxies = core.proxies
|
||||||
|
if core.verify:
|
||||||
|
self.verify = core.verify
|
||||||
|
if core.morestoryline:
|
||||||
|
self.morestoryline = True
|
||||||
|
|
||||||
|
def queryNumberUrl(self, number):
|
||||||
|
""" 根据番号查询详细信息url
|
||||||
|
|
||||||
|
备份查询页面,预览图可能需要
|
||||||
|
"""
|
||||||
|
url = httprequest.get(number)
|
||||||
|
return url
|
||||||
|
|
||||||
|
def getHtml(self, url, type = None):
|
||||||
|
""" 访问网页
|
||||||
|
"""
|
||||||
|
resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type)
|
||||||
|
if '<title>404 Page Not Found' in resp \
|
||||||
|
or '<title>未找到页面' in resp \
|
||||||
|
or '404 Not Found' in resp \
|
||||||
|
or '<title>404' in resp \
|
||||||
|
or '<title>お探しの商品が見つかりません' in resp:
|
||||||
|
return 404
|
||||||
|
return resp
|
||||||
|
|
||||||
|
def getHtmlTree(self, url, type = None):
|
||||||
|
""" 访问网页,返回`etree`
|
||||||
|
"""
|
||||||
|
resp = self.getHtml(url, type)
|
||||||
|
if resp == 404:
|
||||||
|
return 404
|
||||||
|
ret = etree.fromstring(resp, etree.HTMLParser())
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def dictformat(self, htmltree):
|
||||||
|
try:
|
||||||
|
dic = {
|
||||||
|
'number': self.getNum(htmltree),
|
||||||
|
'title': self.getTitle(htmltree),
|
||||||
|
'studio': self.getStudio(htmltree),
|
||||||
|
'year': self.getYear(htmltree),
|
||||||
|
'outline': self.getOutline(htmltree),
|
||||||
|
'runtime': self.getRuntime(htmltree),
|
||||||
|
'director': self.getDirector(htmltree),
|
||||||
|
'actor': self.getActors(htmltree),
|
||||||
|
'release': self.getRelease(htmltree),
|
||||||
|
'cover': self.getCover(htmltree),
|
||||||
|
'cover_small': self.getSmallCover(htmltree),
|
||||||
|
'extrafanart': self.getExtrafanart(htmltree),
|
||||||
|
'trailer': self.getTrailer(htmltree),
|
||||||
|
'imagecut': self.imagecut,
|
||||||
|
'tag': self.getTags(htmltree),
|
||||||
|
'label': self.getLabel(htmltree),
|
||||||
|
'actor_photo': self.getActorPhoto(htmltree),
|
||||||
|
'website': self.detailurl,
|
||||||
|
'source': self.source,
|
||||||
|
'series': self.getSeries(htmltree),
|
||||||
|
'uncensored': self.getUncensored(htmltree),
|
||||||
|
'userrating': self.getUserRating(htmltree),
|
||||||
|
'uservotes': self.getUserVotes(htmltree)
|
||||||
|
}
|
||||||
|
dic = self.extradict(dic)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
dic = {"title": ""}
|
||||||
|
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
|
||||||
|
return js
|
||||||
|
|
||||||
|
def extradict(self, dic:dict):
|
||||||
|
""" 额外修改dict
|
||||||
|
"""
|
||||||
|
return dic
|
||||||
|
|
||||||
|
def getNum(self, htmltree):
|
||||||
|
""" 增加 strip 过滤
|
||||||
|
"""
|
||||||
|
return self.getTreeElement(htmltree, self.expr_number)
|
||||||
|
|
||||||
|
def getTitle(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_title).strip()
|
||||||
|
|
||||||
|
def getStudio(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_studio).strip(" ['']")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_studio2).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getYear(self, htmltree):
|
||||||
|
""" year基本都是从release中解析的
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
release = self.getRelease(htmltree)
|
||||||
|
return str(re.findall('\d{4}', release)).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''  # `release` may be unbound here if getRelease() raised
|
||||||
|
|
||||||
|
def getRuntime(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getRelease(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-')
|
||||||
|
|
||||||
|
def getOutline(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_outline).strip().replace("\n","")
|
||||||
|
|
||||||
|
def getDirector(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_director)
|
||||||
|
|
||||||
|
def getActors(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_actor)
|
||||||
|
|
||||||
|
def getTags(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_tags)
|
||||||
|
|
||||||
|
def getLabel(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_label).strip(" ['']")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_label2).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getSeries(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_series).strip(" ['']")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_series2).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getCover(self, htmltree):
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_cover).strip(" ['']")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
return self.getTreeElement(htmltree, self.expr_cover2).strip(" ['']")
|
||||||
|
except:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def getSmallCover(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_smallcover)
|
||||||
|
|
||||||
|
def getExtrafanart(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_extrafanart)
|
||||||
|
|
||||||
|
def getTrailer(self, htmltree):
|
||||||
|
return self.getTreeElement(htmltree, self.expr_trailer)
|
||||||
|
|
||||||
|
def getActorPhoto(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_actorphoto)
|
||||||
|
|
||||||
|
def getUncensored(self, htmlree):
|
||||||
|
if self.expr_uncensored:
|
||||||
|
u = self.getTreeAll(htmlree, self.expr_uncensored)
|
||||||
|
return bool(u)
|
||||||
|
else:
|
||||||
|
return self.uncensored
|
||||||
|
|
||||||
|
def getUserRating(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_userrating)
|
||||||
|
|
||||||
|
def getUserVotes(self, htmltree):
|
||||||
|
return self.getTreeAll(htmltree, self.expr_uservotes)
|
||||||
|
|
||||||
|
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
|
||||||
|
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
|
||||||
|
"""
|
||||||
|
return getTreeElement(tree, expr, index)
|
||||||
|
|
||||||
|
def getTreeAll(self, tree: html.HtmlElement, expr):
|
||||||
|
""" 根据表达式从`xmltree`中获取全部匹配值
|
||||||
|
"""
|
||||||
|
return getTreeAll(tree, expr)
|
||||||
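To tie the base class together, a sketch of what a new site parser on top of Parser could look like; the class name, url pattern and xpath expressions are made up for illustration.

from scrapinglib.parser import Parser

class ExampleSite(Parser):
    source = 'examplesite'

    # made-up expressions; the base class evaluates them in dictformat()
    expr_number = '//span[@class="number"]/text()'
    expr_title = '/html/head/title/text()'
    expr_cover = '//img[@class="cover"]/@src'
    expr_release = '//span[@class="release-date"]/text()'

    def queryNumberUrl(self, number):
        # made-up detail-page url pattern
        return 'https://www.example.com/movie/' + number

# Parser.search() then fetches the page, runs the expressions and
# returns the metadata as a JSON string via dictformat().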
@@ -1,16 +1,29 @@
|
|||||||
import sys
|
# -*- coding: utf-8 -*-
|
||||||
sys.path.append('../')
|
"""
|
||||||
|
This part has not been reworked yet
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import secrets
|
||||||
import builtins
|
import builtins
|
||||||
from ADC_function import *
|
from urllib.parse import urljoin
|
||||||
from lxml.html import fromstring
|
from lxml.html import fromstring
|
||||||
from multiprocessing.dummy import Pool as ThreadPool
|
from multiprocessing.dummy import Pool as ThreadPool
|
||||||
from difflib import SequenceMatcher
|
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
|
||||||
from unicodedata import category
|
|
||||||
from number_parser import is_uncensored
|
|
||||||
|
|
||||||
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
|
# the Amazon source has been dropped
|
||||||
|
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
|
||||||
|
|
||||||
G_mode_txt = ('顺序执行','线程池')
|
G_mode_txt = ('顺序执行','线程池')
|
||||||
|
def is_japanese(raw: str) -> bool:
|
||||||
|
"""
|
||||||
|
Simple Japanese-text detection
|
||||||
|
"""
|
||||||
|
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
|
||||||
|
|
||||||
class noThread(object):
|
class noThread(object):
|
||||||
def map(self, fn, param):
|
def map(self, fn, param):
|
||||||
@@ -22,18 +35,14 @@ class noThread(object):
|
|||||||
|
|
||||||
|
|
||||||
# Fetch the storyline: query the listed sites concurrently; earlier sites take priority
|
# Fetch the storyline: query the listed sites concurrently; earlier sites take priority
|
||||||
def getStoryline(number, title, sites: list=None, 无码=None):
|
def getStoryline(number, title = None, sites: list=None, uncensored=None):
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
conf = config.getInstance()
|
debug = False
|
||||||
if not conf.is_storyline():
|
storyine_sites = "1:avno1,4:airavwiki".split(',')
|
||||||
return ''
|
if uncensored:
|
||||||
debug = conf.debug() or conf.storyline_show() == 2
|
storyine_sites += "3:58avgo".split(',')
|
||||||
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
|
|
||||||
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
|
|
||||||
if unc:
|
|
||||||
storyine_sites += conf.storyline_uncensored_site().split(',')
|
|
||||||
else:
|
else:
|
||||||
storyine_sites += conf.storyline_censored_site().split(',')
|
storyine_sites += "2:airav,5:xcity".split(',')
|
||||||
r_dup = set()
|
r_dup = set()
|
||||||
sort_sites = []
|
sort_sites = []
|
||||||
for s in storyine_sites:
|
for s in storyine_sites:
|
||||||
@@ -47,18 +56,11 @@ def getStoryline(number, title, sites: list=None, 无码=None):
|
|||||||
cores = min(len(apply_sites), os.cpu_count())
|
cores = min(len(apply_sites), os.cpu_count())
|
||||||
if cores == 0:
|
if cores == 0:
|
||||||
return ''
|
return ''
|
||||||
run_mode = 1 if conf.storyline_mode() > 0 else 0
|
run_mode = 1
|
||||||
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
|
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
|
||||||
results = pool.map(getStoryline_mp, mp_args)
|
results = pool.map(getStoryline_mp, mp_args)
|
||||||
sel = ''
|
sel = ''
|
||||||
if not debug and conf.storyline_show() == 0:
|
|
||||||
for value in results:
|
|
||||||
if isinstance(value, str) and len(value):
|
|
||||||
if not is_japanese(value):
|
|
||||||
return value
|
|
||||||
if not len(sel):
|
|
||||||
sel = value
|
|
||||||
return sel
|
|
||||||
# The debug output below is written to the log
|
# The debug output below is written to the log
|
||||||
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
|
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
|
||||||
sel_site = ''
|
sel_site = ''
|
||||||
@@ -72,7 +74,7 @@ def getStoryline(number, title, sites: list=None, 无码=None):
|
|||||||
for site, desc in zip(apply_sites, results):
|
for site, desc in zip(apply_sites, results):
|
||||||
sl = len(desc) if isinstance(desc, str) else 0
|
sl = len(desc) if isinstance(desc, str) else 0
|
||||||
s += f',[选中{site}字数:{sl}]' if site == sel_site else f',{site}字数:{sl}' if sl else f',{site}:空'
|
s += f',[选中{site}字数:{sl}]' if site == sel_site else f',{site}字数:{sl}' if sl else f',{site}:空'
|
||||||
# print(s)
|
print(s)
|
||||||
return sel
|
return sel
|
||||||
|
|
||||||
|
|
||||||
@@ -91,8 +93,8 @@ def getStoryline_mp(args):
|
|||||||
storyline = getStoryline_avno1(number, debug)
|
storyline = getStoryline_avno1(number, debug)
|
||||||
elif site == "xcity":
|
elif site == "xcity":
|
||||||
storyline = getStoryline_xcity(number, debug)
|
storyline = getStoryline_xcity(number, debug)
|
||||||
elif site == "amazon":
|
# elif site == "amazon":
|
||||||
storyline = getStoryline_amazon(title, number, debug)
|
# storyline = getStoryline_amazon(title, number, debug)
|
||||||
elif site == "58avgo":
|
elif site == "58avgo":
|
||||||
storyline = getStoryline_58avgo(number, debug)
|
storyline = getStoryline_58avgo(number, debug)
|
||||||
if not debug:
|
if not debug:
|
||||||
@@ -287,126 +289,3 @@ def getStoryline_xcity(number, debug): # get the storyline from xcity
             print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
             pass
     return ''
-
-
-def getStoryline_amazon(q_title, number, debug):
-    if not isinstance(q_title, str) or not len(q_title):
-        return None
-    try:
-        cookie, cookies_filepath = load_cookies('amazon.json')
-        url = "https://www.amazon.co.jp/s?k=" + q_title
-        res, session = get_html_session(url, cookies=cookie, return_type='session')
-        if not res:
-            raise ValueError("get_html_session() failed")
-        lx = fromstring(res.text)
-        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
-        if len(lks) and lks[0].startswith('/'):
-            res = session.get(urljoin(res.url, lks[0]))
-            cookie = None
-            lx = fromstring(res.text)
-        titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
-        urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
-        if not len(urls) or len(urls) != len(titles):
-            raise ValueError("titles not found")
-        idx = amazon_select_one(titles, q_title, number, debug)
-        if not isinstance(idx, int) or idx < 0:
-            raise ValueError("title and number not found")
-        furl = urljoin(res.url, urls[idx])
-        res = session.get(furl)
-        if not res.ok:
-            raise ValueError("browser.open_relative() failed.")
-        lx = fromstring(res.text)
-        lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
-        if len(lks) and lks[0].startswith('/'):
-            res = session.get(urljoin(res.url, lks[0]))
-            cookie = None
-            lx = fromstring(res.text)
-        p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
-        p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
-        ama_t = ' '.join(p1) + ' '.join(p2)
-        ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
-
-        if cookie is None:
-            # Drop the invalid cookies file (user-created or auto-created) to avoid repeated failures
-            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
-            # The auto-created cookies file sits at the end of the search-path list (lowest priority);
-            # users with an amazon.co.jp account can export browser cookies into an earlier search path
-            ama_save = Path.home() / ".local/share/mdc/amazon.json"
-            ama_save.parent.mkdir(parents=True, exist_ok=True)
-            ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
-
-        return ama_t
-
-    except Exception as e:
-        if debug:
-            print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
-        pass
-    return None
-
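
# --- Aside (not part of the diff): the cookie handling removed above boils down to
# --- "reuse saved cookies if present, and persist the session cookies for the next run".
# --- A minimal sketch with plain requests (load_cookies()/get_html_session() above are
# --- project helpers whose definitions are not part of this diff):
import json
from pathlib import Path
import requests

cookie_file = Path.home() / ".local/share/mdc/amazon.json"

session = requests.Session()
if cookie_file.is_file():
    session.cookies.update(json.loads(cookie_file.read_text(encoding='utf-8')))

# ... perform requests through `session` here ...

cookie_file.parent.mkdir(parents=True, exist_ok=True)
cookie_file.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4),
                       encoding='utf-8')
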
-# Among the DVD/Blu-ray listings, pick the one whose title is most similar to the query title
-def amazon_select_one(a_titles, q_title, number, debug):
-    sel = -1
-    ratio = 0
-    que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
-    for tloc, title in enumerate(a_titles):
-        if re.search(number, title, re.I):  # titles rarely contain the ID, but a few do; an exact ID match is accepted immediately
-            return tloc
-        if not re.search('DVD|Blu-ray', title, re.I):
-            continue
-        ama_t = str(re.sub('DVD|Blu-ray', "", title, flags=re.I))
-        ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
-        findlen = 0
-        lastpos = -1
-        for cloc, char in reversed(tuple(enumerate(ama_t))):
-            pos = que_t.rfind(char)
-            if lastpos >= 0:
-                pos_near = que_t[:lastpos].rfind(char)
-                if pos_near < 0:
-                    findlen = 0
-                    lastpos = -1
-                    ama_t = ama_t[:cloc+1]
-                else:
-                    pos = pos_near
-            if pos < 0:
-                if category(char) == 'Nd':
-                    return -1
-                if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
-                    return -1
-                ama_t = ama_t[:cloc]
-                findlen = 0
-                lastpos = -1
-                continue
-            if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
-                findlen += 1
-                lastpos = pos
-                if findlen >= 4:
-                    break
-                continue
-            findlen = 1
-            lastpos = pos
-        if findlen == 0:
-            return -1
-        r = SequenceMatcher(None, ama_t, que_t).ratio()
-        if r > ratio:
-            sel = tloc
-            ratio = r
-            save_t_ = ama_t
-            if ratio > 0.999:
-                break
-
-    if ratio < 0.5:
-        return -1
-
-    if not debug:
-        # Only results with similarity of at least 0.9 are currently trusted
-        return sel if ratio >= 0.9 else -1
-
-    # In debug mode, log how accurate the matching was
-    if ratio < 0.9:
-        # Rejected results with similarity in [0.5, 0.9) go to a separate log
-        with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
-            hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
-        return -1
-    # Log of accepted results
-    with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
-        hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
-    return sel
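
# --- Aside (not part of the diff): the similarity gate above uses difflib. A tiny
# --- self-contained illustration of the 0.9 threshold with made-up titles:
from difflib import SequenceMatcher

query = "ABCDEFGHIJ"
for cand in ("ABCDEFGHIX", "completely different"):
    r = SequenceMatcher(None, cand, query).ratio()
    print(f"{cand!r}: ratio={r:.2f}, accepted={r >= 0.9}")
# 'ABCDEFGHIX': ratio=0.90, accepted=True
# 'completely different': ratio=0.00, accepted=False
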
scrapinglib/tmdb.py (new file, +40 lines)
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-


from .parser import Parser


class Tmdb(Parser):
    """
    Two implementations: with an apikey and without one.
    apikey
    """
    source = 'tmdb'
    imagecut = 0
    apikey = None

    expr_title = '//head/meta[@property="og:title"]'
    expr_release = '//div/span[@class="release"]/text()'
    expr_cover = '//head/meta[@property="og:image"]'
    expr_outline = '//head/meta[@property="og:description"]'

    # def search(self, number):
    #     self.detailurl = self.queryNumberUrl(number)
    #     detailpage = self.getHtml(self.detailurl)

    def queryNumberUrl(self, number):
        """
        TODO: distinguish between an ID and a name
        """
        id = number
        movieUrl = "https://www.themoviedb.org/movie/" + id + "?language=zh-CN"
        return movieUrl

    def getTitle(self, htmltree):
        return self.getTreeElement(htmltree, self.expr_title).get('content')

    def getCover(self, htmltree):
        return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content')

    def getOutline(self, htmltree):
        return self.getTreeElement(htmltree, self.expr_outline).get('content')
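
# --- Aside (not part of the diff): Tmdb maps a TMDB movie id straight to its page URL and
# --- then reads the OpenGraph <meta> tags via the expr_* XPaths above. A self-contained
# --- sketch of that extraction (the Parser base class and its helpers are not shown here):
from lxml import etree

sample_html = '''<html><head>
  <meta property="og:title" content="Example Movie (2021)">
  <meta property="og:description" content="A short plot summary.">
</head><body></body></html>'''

tree = etree.fromstring(sample_html, etree.HTMLParser())
title = tree.xpath('//head/meta[@property="og:title"]')[0].get('content')
outline = tree.xpath('//head/meta[@property="og:description"]')[0].get('content')
print(title, '|', outline)   # Example Movie (2021) | A short plot summary.
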
scrapinglib/utils.py (new file, +31 lines)
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

from lxml.html import HtmlElement

def getTreeElement(tree: HtmlElement, expr, index=0):
    """ Return the value matched by `expr` in the tree; `index` defaults to 0.
    :param tree (html.HtmlElement)
    :param expr
    :param index
    """
    if expr == '':
        return ''
    result = tree.xpath(expr)
    try:
        return result[index]
    except:
        return ''

def getTreeAll(tree: HtmlElement, expr):
    """ Return all values matched by `expr` in the tree.
    :param tree (html.HtmlElement)
    :param expr
    """
    if expr == '':
        return ''
    result = tree.xpath(expr)
    try:
        return result
    except:
        return ''
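
# --- Aside (not part of the diff): how the two helpers above behave, assuming they are
# --- importable as scrapinglib.utils; the HTML snippet is made up:
from lxml import html
from scrapinglib.utils import getTreeElement, getTreeAll

tree = html.fromstring('<ul><li>first</li><li>second</li></ul>')
print(getTreeElement(tree, '//li/text()'))      # 'first'  (index defaults to 0)
print(getTreeElement(tree, '//li/text()', 5))   # ''       (out-of-range index is swallowed)
print(getTreeAll(tree, '//li/text()'))          # ['first', 'second']
print(getTreeElement(tree, ''))                 # ''       (empty expression short-circuits)
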
scrapinglib/xcity.py (new file, +122 lines)
@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-

import re
import secrets
from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_by_form
from .parser import Parser


class Xcity(Parser):
    source = 'xcity'

    expr_number = '//*[@id="hinban"]/text()'
    expr_title = '//*[@id="program_detail_title"]/text()'
    expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
    expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
    expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
    expr_label = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()'
    expr_release = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()'
    expr_tags = '//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()'
    expr_cover = '//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href'
    expr_director = '//*[@id="program_detail_director"]/text()'
    expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
    expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"

    def getStudio(self, htmltree):
        return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')

    def getRuntime(self, htmltree):
        return self.getTreeAll(htmltree, self.expr_runtime)[1].strip()

    def getRelease(self, htmltree):
        try:
            result = self.getTreeElement(htmltree, self.expr_release, 1)
            return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
        except:
            return ''

    def getTags(self, htmltree):
        result = self.getTreeAll(htmltree, self.expr_tags)
        total = []
        for i in result:
            total.append(i.replace("\n","").replace("\t",""))
        return total

    def getCover(self, htmltree):
        try:
            result = super().getCover(htmltree)
            return 'https:' + result
        except:
            return ''

    def getDirector(self, htmltree):
        try:
            result = super().getDirector(htmltree).replace(u'\n','').replace(u'\t', '')
            return result
        except:
            return ''

    def getOutline(self, htmltree):
        if self.morestoryline:
            from .storyline import getStoryline
            return getStoryline(self.number, uncensored=False)
        return ''

    def getActors(self, htmltree):
        htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
        t = []
        for i in htmla:
            t.append(i.text.strip())
        return t

    def getActorPhoto(self, htmltree):
        htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
        t = {i.text.strip(): i['href'] for i in htmla}
        o = {}
        for k, v in t.items():
            r = self.browser.open_relative(v)
            if not r.ok:
                continue
            pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
            if 'noimage.gif' in pic['src']:
                continue
            o[k] = urljoin(self.browser.url, pic['src'])
        return o

    def getExtrafanart(self, htmltree):
        html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
        html = html_pather.search(self.detail_page)
        if html:
            html = html.group()
            extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
            extrafanart_imgs = extrafanart_pather.findall(html)
            if extrafanart_imgs:
                s = []
                for urli in extrafanart_imgs:
                    urli = 'https:' + urli.replace('/scene/small', '')
                    s.append(urli)
                return s
        return ''

    def open_by_browser(self, number):
        xcity_number = number.replace('-','')
        query_result, browser = get_html_by_form(
            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
            fields = {'q' : xcity_number.lower()},
            return_type = 'browser')
        if not query_result or not query_result.ok:
            raise ValueError("xcity.py: page not found")
        result = browser.follow_link(browser.links(r'avod\/detail')[0])
        if not result.ok:
            raise ValueError("xcity.py: detail page not found")
        return str(browser.page), browser

    def search(self, number):
        self.number = number
        self.detail_page, self.browser = self.open_by_browser(number)
        self.detailurl = self.browser.url
        lx = etree.fromstring(self.detail_page, etree.HTMLParser())
        result = self.dictformat(lx)
        return result
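
# --- Aside (not part of the diff): Xcity.getRelease() above normalizes the site's
# --- YYYY/MM/DD date to ISO style; a quick self-contained check of that regex, plus the
# --- (network-dependent) entry point it belongs to:
import re

raw = '発売日: 2021/03/19'   # made-up sample in the xcity markup format
print(re.findall(r'\d{4}/\d{2}/\d{2}', raw)[0].replace('/', '-'))   # 2021-03-19

# With network access and the scrapinglib package available, a lookup would run through:
#   from scrapinglib.xcity import Xcity
#   data = Xcity().search('ABC-123')   # made-up ID; returns the dictformat() result
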