Merge pull request #817 from Suwmlee/dev

refactor webcrawler
This commit is contained in:
Yoshiko2
2022-06-14 20:55:51 +08:00
committed by GitHub
42 changed files with 2522 additions and 3026 deletions

View File

@@ -1,227 +0,0 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup  # needs install: beautifulsoup4
from ADC_function import *
from WebCrawler import javbus
'''
API
Sign up:  https://www.airav.wiki/api/auth/signup
Settings: https://www.airav.wiki/api/get_web_settings
Search:   https://www.airav.wiki/api/video/list?lng=zh-CN&search=
Search:   https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
'''
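# A minimal sketch of how the paginated search endpoint above is consumed (the
# response shape is taken from the sample documented inside search() below):
#   resp = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=ADN-188&page=1')
#   data = json.loads(resp)   # {"offset": ..., "count": ..., "result": [...], "status": "ok"}
#   # keep fetching page + 1 until offset + len(result) reaches count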
host = 'https://www.airav.wiki'
# airav has no actor photos, so use the javbus images directly
def getActorPhoto(javbus_json):
result = javbus_json.get('actor_photo')
if isinstance(result, dict) and len(result):
return result
return ''
def getTitle(htmlcode): # get title
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
def getStudio(htmlcode, javbus_json): # get studio (revised)
# prefer the javbus data when it exists
result = javbus_json.get('studio')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode,etree.HTMLParser())
return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
def getYear(htmlcode, javbus_json): # get year
result = javbus_json.get('year')
if isinstance(result, str) and len(result):
return result
release = getRelease(htmlcode, javbus_json)
if len(release) != len('2000-01-01'):
return ''
return release[:4]
def getCover(htmlcode, javbus_json): # get cover image
result = javbus_json.get('cover')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
def getRelease(htmlcode, javbus_json): # get release date
result = javbus_json.get('release')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
except:
return ''
return result
def getRuntime(javbus_json): # get runtime
result = javbus_json.get('runtime')
if isinstance(result, str) and len(result):
return result
return ''
# airav's actress database uses mostly kanji names while javbus uses mostly kana, so airav takes priority
def getActor(htmlcode, javbus_json): # get actresses
b=[]
html = etree.fromstring(htmlcode, etree.HTMLParser())
a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
for v in a:
v = v.strip()
if len(v):
b.append(v)
if len(b):
return b
result = javbus_json.get('actor')
if isinstance(result, list) and len(result):
return result
return []
def getNum(htmlcode, javbus_json): # get ID number
result = javbus_json.get('number')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('^\[(.*?)]', title)[0])
return result
def getDirector(javbus_json): # get director (revised)
result = javbus_json.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getOutline(htmlcode): # get outline
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
return result
except:
return ''
def getSerise(javbus_json): # get series (revised)
result = javbus_json.get('series')
if isinstance(result, str) and len(result):
return result
return ''
def getTag(htmlcode): # get tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
a = x[0].find_all('a')
for i in a:
tag.append(i.get_text())
return tag
def getExtrafanart(htmlcode): # get extra fanart (stills)
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def search(keyword): # search and return results
result = []
page = 1
while page > 0:
# search_result = {"offset": 0,"count": 4,"result": [
# {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
# "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
# {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
# "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
# {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
# "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
# {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
# "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
# ],"status": "ok"}
search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))
try:
json_data = json.loads(search_result)
except json.decoder.JSONDecodeError:
# print("[-]Json decoder error!")
return []
result_offset = int(json_data["offset"])
result_count = int(json_data["count"])
result_size = len(json_data["result"])
if result_count <= 0 or result_size <= 0:
return result
elif result_count > result_offset + result_size: # fetch the next page
result.extend(json_data["result"])
page += 1
elif result_count == result_offset + result_size: # this was the last page
result.extend(json_data["result"])
page = 0
else:
page = 0
return result
def main(number):
try:
try:
htmlcode = get_html('https://cn.airav.wiki/video/' + number)
javbus_json = json.loads(javbus.main(number))
except:
# print(number)
pass
dic = {
# title: use airav
'title': getTitle(htmlcode),
# studio: try javbus first, fall back to this site
'studio': getStudio(htmlcode, javbus_json),
# year: try javbus first, fall back to this site
'year': getYear(htmlcode, javbus_json),
# outline: use airav
'outline': getOutline(htmlcode),
# use javbus
'runtime': getRuntime(javbus_json),
# director: use javbus
'director': getDirector(javbus_json),
# actors: try airav first
'actor': getActor(htmlcode, javbus_json),
# release date: try javbus first
'release': getRelease(htmlcode, javbus_json),
# ID number: use javbus
'number': getNum(htmlcode, javbus_json),
# cover URL: use javbus
'cover': getCover(htmlcode, javbus_json),
# extra fanart (stills)
'extrafanart': getExtrafanart(htmlcode),
'imagecut': 1,
# use airav
'tag': getTag(htmlcode),
# use javbus
'label': getSerise(javbus_json),
'actor_photo': getActorPhoto(javbus_json),
'website': 'https://www.airav.wiki/video/' + number,
'source': 'airav.py',
# use javbus
'series': getSerise(javbus_json)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('ADV-R0624')) # javbus page returns 404, airav has data
print(main('ADN-188')) # single actress
print(main('CJOD-278')) # multiple actresses; javbus uses kana for actress names, airav uses kanji

View File

@@ -1,86 +0,0 @@
import sys
sys.path.append('..')
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getActorPhoto(html):
a = html.xpath('//a[@class="avatar-box"]')
d = {}
for i in a:
l = i.find('.//img').attrib['src']
t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.find('span').text)
return d
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x[2:]] if len(x) > 2 else []
def main(number):
html = get_html('https://tellme.pw/avsox')
site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
a = get_html(site + '/cn/search/' + number)
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
avsox_crawler2 = Crawler(a)
avsox_crawler = Crawler(detail)
try:
new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
dic = {
'actor': getActor(lx),
'title': title,
'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
'outline': getStoryline(number, title),
'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
'director': '', #
'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
'number': new_number,
'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
#'cover_small' : getCover_small(html),
'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
'imagecut': 3,
'tag': getTag(lx),
'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
}
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == "__main__":
print(main('012717_472'))
print(main('1')) # fake search result triggers the 'number not found' ValueError

View File

@@ -1,133 +0,0 @@
import sys
sys.path.append('../')
from lxml import html
from ADC_function import *
from WebCrawler.storyline import getStoryline
G_SITE = 'https://www.caribbeancom.com'
def main(number: str) -> json:
try:
url = f'{G_SITE}/moviepages/{number}/index.html'
result, session = get_html_session(url, return_type='session')
htmlcode = result.content.decode('euc-jp')
if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
lx = html.fromstring(htmlcode)
title = get_title(lx)
dic = {
'title': title,
'studio': '加勒比',
'year': get_year(lx),
'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
'release': get_release(lx),
'number': number,
'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
'tag': get_tag(lx),
'extrafanart': get_extrafanart(lx),
'label': get_series(lx),
'imagecut': 1,
'website': f'{G_SITE}/moviepages/{number}/index.html',
'source': 'carib.py',
'series': get_series(lx),
'无码': True
}
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = get_actor_photo(lx, session)
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title, 无码=True)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
def get_actor(lx: html.HtmlElement):
r = []
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
for act in actors:
if str(act) != '':
r.append(act)
return r
def get_tag(lx: html.HtmlElement) -> str:
genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
return genres
def get_extrafanart(lx: html.HtmlElement) -> str:
r = []
genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
for g in genres:
jpg = str(g)
if '/member/' in jpg:
break
else:
r.append('https://www.caribbeancom.com' + jpg)
return r
def get_series(lx: html.HtmlElement) -> str:
try:
return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
except:
return ''
def get_runtime(lx: html.HtmlElement) -> str:
return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
def get_actor_photo(lx, session):
htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
t = {}
for name, a in zip(names, htmla):
if name.strip() == '':
continue
p = {name.strip(): a.attrib['href']}
t.update(p)
o = {}
for k, v in t.items():
if '/search_act/' not in v:
continue
r = session.get(urljoin(G_SITE, v))
if not r.ok:
continue
html = r.text
pos = html.find('.full-bg')
if pos<0:
continue
css = html[pos:pos+100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(r.url, cssBGjpgs[0])}
o.update(p)
return o
if __name__ == "__main__":
print(main("070116-197")) # actor have photo
print(main("041721-001"))
print(main("080520-001"))

View File

@@ -1,28 +0,0 @@
from lxml import etree
class Crawler:
def __init__(self,htmlcode):
self.html = etree.HTML(htmlcode)
def getString(self,_xpath):
if _xpath == "":
return ""
result = self.html.xpath(_xpath)
try:
return result[0]
except:
return ""
def getStrings(self,_xpath):
result = self.html.xpath(_xpath)
try:
return result
except:
return ""
def getOutline(self,_xpath):
result = self.html.xpath(_xpath)
try:
return "\n".join(result)
except:
return ""

View File

@@ -1,185 +0,0 @@
import re
from lxml import etree
import json
import sys
sys.path.append('../')
from ADC_function import *
def getTitle(html):
result = str(html.xpath('/html/head/title/text()')[0])
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
result = result.replace('【HD版】', '')
return result
def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except:
result1 = ''
return result1
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
d={}
for i in a:
p={i:''}
d.update(p)
return d
def getStudio(html):
try:
try:
result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
def getYear(getRelease):
try:
result = str(re.search('\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(html):
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
# dates on dlsite look like 2021年06月25日; convert to 2021-06-25
return result1.replace('年','-').replace('月','-').replace('日','')
def getTag(html):
try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result
except:
return ''
def getCover_small(a, index=0):
# same issue mentioned below,
# javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(html):
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
return result.replace('.webp', '.jpg')
def getDirector(html):
try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except:
result = ''
return result
def getOutline(html):
total = []
result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
#
def getExtrafanart(html):
try:
result = []
for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'):
result.append("https:" + i)
except:
result = ''
return result
def main(number):
try:
if "RJ" in number or "VJ" in number:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
else:
htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})
html = etree.HTML(htmlcode)
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
if "" in number:
number = number.replace("","")
elif "" in number:
number = number.replace("","")
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
a = search_result[0]
html = etree.HTML(get_html(a,cookies={'locale': 'zh-cn'}))
number = str(re.findall("\wJ\w+",a)).strip(" [']")
dic = {
'actor': getStudio(html),
'title': getTitle(html),
'studio': getStudio(html),
'outline': getOutline(html),
'runtime': '',
'director': getDirector(html),
'release': getRelease(html),
'number': number,
'cover': 'https:' + getCover(html),
'cover_small': '',
'imagecut': 4,
'tag': getTag(html),
'label': getLabel(html),
'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(html),
'extrafanart':getExtrafanart(html),
'allow_number_change':True,
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('牝教師4穢された教壇 「生意気ドジっ娘女教師・美結高飛車ハメ堕ち2濁金」'))
print(main('RJ329607'))

View File

@@ -1,205 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
from urllib.parse import urlencode
from ADC_function import *
from WebCrawler.crawler import *
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getFanzaStrings(self, string):
result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
def getRelease(fanza_Crawler):
result = fanza_Crawler.getFanzaString('発売日:')
if result == '' or result == '----':
result = fanza_Crawler.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getCover(html, number):
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# sometimes fanza changes "_" to "\u005f" in the image id
if "_" in cover_number:
cover_number = cover_number.replace("_", r"\u005f")
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# (TODO) handle more edge cases
# print(html)
# raise exception here, same behavior as before
# people's major requirement is fetching the picture
raise ValueError("can not find image")
return result
def getOutline(html):
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
except:
# (TODO) handle more edge cases
# print(html)
return ""
return result
def getExtrafanart(htmlcode): # get extra fanart (stills)
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for img_url in extrafanart_imgs:
img_urls = img_url.rsplit('-', 1)
img_url = img_urls[0] + 'jp-' + img_urls[1]
s.append(img_url)
return s
return ''
def main(number):
# fanza allows letters, numbers and underscore; normalize the input here
# @note: underscore has only been seen in the form h_test123456789
fanza_search_number = number
# getNumber() in AV_Data_Capture.py over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
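# e.g. (sketch): "h-test00123" -> "h_test00123", "ABP-123" -> "abp123"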
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls:
chosen_url = url + fanza_search_number
htmlcode = get_html(
"https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
urlencode({"rurl": chosen_url})
)
)
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
return json.dumps({"title": "",})
try:
# for some old pages, the input number does not match the page
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, then pass it to the following functions
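# e.g. (a sketch of the match below): for input "abc-123" the page hinban may be "abc123", "abc00123" or "abc123so"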
fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
director = fanza_Crawler.getFanzaString('監督:')
if "anime" in chosen_url:
director = ""
actor = fanza_Crawler.getStrings("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()")
if "anime" in chosen_url:
actor = ""
# ----
series = fanza_Crawler.getFanzaString('シリーズ:')
if series == "----":
series = ""
label = fanza_Crawler.getFanzaString('レーベル')
if label == "----":
label = ""
data = {
"title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(html),
"runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": director,
"actor": actor,
"release": getRelease(fanza_Crawler),
"number": out_num,
"cover": getCover(html, fanza_hinban),
"imagecut": 1,
"tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode),
"label": label,
"year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
"series": series,
}
except Exception as e:
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
) # .encode('UTF-8')
return js
def main_htmlcode(number):
# fanza allows letters, numbers and underscore; normalize the input here
# @note: underscore has only been seen in the form h_test123456789
fanza_search_number = number
# getNumber() in AV_Data_Capture.py over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
]
chosen_url = ""
for url in fanza_urls:
chosen_url = url + fanza_search_number
htmlcode = get_html(chosen_url)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
return json.dumps({"title": "",})
return htmlcode
if __name__ == "__main__":
# print(main("DV-1562"))
# print(main("96fad1217"))
print(main("AES-002"))
print(main("MIAA-391"))
print(main("OBA-326"))

View File

@@ -1,80 +0,0 @@
import sys
sys.path.append('../')
import re
import json
import config
import ADC_function
from WebCrawler.crawler import *
def getExtrafanart(htmlcode): # get extra fanart (stills)
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTrailer(htmlcode, number):
video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
video = video_pather.findall(htmlcode)
if video:
try:
video_url = video[0].replace('\'', '')
video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + number + '/sample?key=' + video_url
url_json = eval(ADC_function.get_html(video_url))['path'].replace('\\', '')
return url_json
except:
return ''
else:
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
fc2_crawler = Crawler(htmlcode2)
actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = {
'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': re.findall('\d{4}',release)[0],
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor,
'release': release,
'number': 'FC2-' + number,
'label': '',
'cover': cover,
'thumb': cover,
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number),
'imagecut': 0,
'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/',
'series': '',
}
except Exception as e:
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
config.getInstance().set_override("debug_mode:switch=1")
#print(main('FC2-2182382'))
#print(main('FC2-607854'))
print(main('FC2-2787433'))

View File

@@ -1,96 +0,0 @@
import sys
sys.path.append('../')
from lxml import etree  # needs install: lxml
import json
import ADC_function
def getTitle_fc2com(htmlcode): # get title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[5]/a/text()')).strip(" ['']")
return result
except:
return ''
def getStudio_fc2com(htmlcode): # get studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode): # get ID number
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
num = title.split(' ')[0]
if num.startswith('FC2') != True:
num = ''
return num
def getRelease_fc2com(htmlcode2): #
return ''
def getCover_fc2com(htmlcode2): # get cover image
html = etree.fromstring(htmlcode2, etree.HTMLParser())
imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
imgUrl = imgUrl.replace('../','https://fc2club.net/')
return imgUrl
def getTag_fc2com(htmlcode): # get tags
html = etree.fromstring(htmlcode,etree.HTMLParser())
a = html.xpath('//*[@class="show-top-grids"]/div[1]/h5[4]/a')
tag = []
for i in range(len(a)):
tag.append(str(a[i].xpath('text()')).strip("['']"))
return tag
def getYear_fc2com(release):
return ''
def getExtrafanart(htmlcode): # get extra fanart (stills)
html = etree.fromstring(htmlcode, etree.HTMLParser())
imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
imgUrl = imgUrl.replace('../','https://fc2club.net/')
return imgUrl
def getTrailer(htmlcode):
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
webUrl = 'https://fc2club.net/html/FC2-' + number + '.html'
htmlcode2 = ADC_function.get_html(webUrl)
actor = getActor_fc2com(htmlcode2)
if getActor_fc2com(htmlcode2) == '':
actor = 'FC2系列'
dic = {
'title': getTitle_fc2com(htmlcode2),
'studio': getStudio_fc2com(htmlcode2),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': '',
'director': getStudio_fc2com(htmlcode2),
'actor': actor,
'release': getRelease_fc2com(htmlcode2),
'number': 'FC2-' + number,
'label': '',
'cover': getCover_fc2com(htmlcode2),
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2),
'imagecut': 0,
'tag': getTag_fc2com(htmlcode2),
'actor_photo': '',
'website': 'https://fc2club.net/html/FC2-' + number + '.html/',
'source': 'https://fc2club.net/html/FC2-' + number + '.html/',
'series': '',
}
except Exception as e:
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
print(main('FC2-402422'))

View File

@@ -1,86 +0,0 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cookies, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cookies)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))

View File

@@ -1,133 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.crawler import *
import re
import time
from urllib.parse import quote
JSON_HEADERS = {"Referer": "https://dl.getchu.com/"}
COOKIES_DL = {"adult_check_flag": "1"}
COOKIES_WWW = {'getchu_adalt_flag': 'getchu.com'}
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
GETCHU_WWW_URL = 'http://www.getchu.com/soft.phtml?id=_WORD_'
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
def get_dl_getchu(number):
if "item" in number or 'GETCHU' in number.upper():
number = re.findall('\d+',number)[0]
else:
htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number),
json_headers=JSON_HEADERS, cookies=COOKIES_DL)
getchu = Crawler(htmlcode)
url = getchu.getString(
'/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
if url == "":
return None
number = re.findall('\d+', url)[0]
htmlcode = get_html(GETCHU_DL_URL.replace("_WORD_", number), json_headers=JSON_HEADERS, cookies=COOKIES_DL)
getchu = Crawler(htmlcode)
dic = {
"title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"),
"cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"),
"director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(),
"studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"actor": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"runtime": str(re.findall('\d+', str(getchu.getString(
"//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))).strip(" ['']"),
"release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-"),
"tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"),
"outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"),
"extrafanart": getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href"),
"series": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
"number": 'GETCHU-' + re.findall('\d+',number)[0],
"imagecut": 4,
"year": str(re.findall('\d{4}', str(getchu.getString(
"//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")))).strip(" ['']"),
"actor_photo": "",
"website": "https://dl.getchu.com/i/" + number,
"source": "getchu.py",
"allow_number_change": True,
}
extrafanart = []
for i in dic['extrafanart']:
i = "https://dl.getchu.com" + i
extrafanart.append(i)
dic['extrafanart'] = extrafanart
time.sleep(1)
return dic
def get_www_getchu(number):
number = quote(number, encoding="euc_jp")
getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if url2 == '':
getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if url2 == "":
return None
url2 = url2.replace('../', 'http://www.getchu.com/')
getchu = Crawler(get_html(url2, cookies=COOKIES_WWW))
dic = {
"title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
"cover": "http://www.getchu.com" + getchu.getString(
"/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
"director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
"studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
"actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
"label": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
"runtime": '',
"release": getchu.getString("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-").strip(),
"tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
"outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
"extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"),
"series": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
"number": 'GETCHU-' + re.findall('\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))[0],
"imagecut": 0,
"year": str(re.findall('\d{4}', str(getchu.getString(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"),
"actor_photo": "",
"website": url2,
"headers": {'referer': url2},
"source": "getchu.py",
"allow_number_change": True,
}
extrafanart = []
for i in dic['extrafanart']:
i = "http://www.getchu.com" + i.replace("./", '/')
if 'jpg' in i:
extrafanart.append(i)
dic['extrafanart'] = extrafanart
time.sleep(1)
return dic
def main(number):
number = number.replace("-C", "")
dic = None
if "item" in number:
sort = [get_dl_getchu, get_www_getchu]
else:
sort = [get_www_getchu, get_dl_getchu]
for fetch in sort:
dic = fetch(number)  # try each source in order until one returns data
if dic is not None:
break
if dic is None:
return {"title": ""}
outline = ''
_list = dic['outline']
for i in _list:
outline = outline + i
dic['outline'] = outline
result = json.dumps(dic,ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )
return result
if __name__ == '__main__':
test = []
for i in test:
print(i)
print(main(i))

View File

@@ -1,185 +0,0 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
import re
def main(number: str) -> json:
try:
result = post_html(url="https://www.jav321.com/search", query={"sn": number})
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
except:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
if "/video/" in result.url:
data = parse_info(soup)
dic = {
"title": get_title(lx),
"year": get_year(data),
"outline": get_outline(lx),
"director": "",
"cover": get_cover(lx),
"imagecut": 1,
"trailer": get_trailer(result.text),
"extrafanart": get_extrafanart(result.text),
"actor_photo": "",
"website": result.url,
"source": "jav321.py",
**data,
}
else:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
def parse_info(soup: BeautifulSoup) -> dict:
data = soup.select_one("div.row > div.col-md-9")
if data:
dd = str(data).split("<br/>")
data_dic = {}
for d in dd:
data_dic[get_bold_text(h=d)] = d
return {
"actor": get_actor(data_dic),
"label": get_label(data_dic),
"studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic).upper(),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic),
}
else:
return {"title": ""}
def get_bold_text(h: str) -> str:
soup = BeautifulSoup(h, "html.parser")
if soup.b:
return soup.b.text
else:
return "UNKNOWN_TAG"
def get_anchor_info(h: str) -> str:
result = []
data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
for d in data:
result.append(d.text)
return ",".join(result)
def get_text_info(h: str) -> str:
return h.split(": ")[1]
def get_trailer(html) -> str:
videourl_pather = re.compile(r'<source src=\"(.*?)\"')
videourl = videourl_pather.findall(html)
if videourl:
url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
return url
else:
return ''
def get_extrafanart(htmlcode): # get extra fanart (stills)
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def get_cover(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
def get_outline(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
def get_series2(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
def get_actor(data: hash) -> str:
if "出演者" in data:
return get_anchor_info(data["出演者"])
else:
return ""
def get_label(data: hash) -> str:
if "メーカー" in data:
return get_anchor_info(data["メーカー"])
else:
return ""
def get_tag(data: hash) -> str:
if "ジャンル" in data:
return get_anchor_info(data["ジャンル"])
else:
return ""
def get_studio(data: hash) -> str:
if "メーカー" in data:
return get_anchor_info(data["メーカー"])
else:
return ""
def get_number(data: hash) -> str:
if "品番" in data:
return get_text_info(data["品番"])
else:
return ""
def get_release(data: hash) -> str:
if "配信開始日" in data:
return get_text_info(data["配信開始日"])
else:
return ""
def get_runtime(data: hash) -> str:
if "収録時間" in data:
return get_text_info(data["収録時間"])
else:
return ""
def get_year(data: hash) -> str:
if "release" in data:
return data["release"][:4]
else:
return ""
def get_series(data: hash) -> str:
if "シリーズ" in data:
return get_anchor_info(data["シリーズ"])
else:
return ""
if __name__ == "__main__":
print(main("jul-404"))

View File

@@ -1,184 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
import inspect
def getActorPhoto(html):
actors = html.xpath('//div[@class="star-name"]/../a/img')
d = {}
for i in actors:
p = i.attrib['src']
if "nowprinting.gif" in p:
continue
t = i.attrib['title']
d[t] = urljoin("https://www.javbus.com", p)
return d
def getTitle(html): # get title
title = str(html.xpath('/html/head/title/text()')[0])
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudioJa(html):
x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getStudio(html): # get studio
x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getYear(html): # get year
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
return result[:4] if len(result)>=len('2000-01-01') else ''
def getCover(html): # get cover URL
image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
return urljoin("https://www.javbus.com", image)
def getRelease(html): # get release date
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getRuntime(html): # get runtime in minutes (revised)
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result
def getActor(html): # get actresses
b=[]
actors = html.xpath('//div[@class="star-name"]/a')
for i in actors:
b.append(i.attrib['title'])
return b
def getNum(html): # get ID number
kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return kwdlist[0]
def getDirectorJa(html):
x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getDirector(html): # get director
x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number, title, uncensored): # get the storyline; fetched concurrently by multiple processes
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # calls originating from airav.py skip the outline to avoid re-scraping the same data and slowing things down
return getStoryline(number,title, 无码=uncensored)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getSerise(html): # get series
x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getTag(html): # get tags
klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return klist[1:]
def getExtrafanart(htmlcode): # get extra fanart (stills)
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
def main_uncensored(number):
w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode:
return {"title": ""}
lx = etree.fromstring(htmlcode, etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSeriseJa(lx),
'imagecut': 0,
'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py',
'series': getSeriseJa(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
def main(number):
try:
try:
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
'dmmbus.fun', 'dmmsee.fun',
'fanbus.us',
'seedmm.fun',
]) + "/"
try:
htmlcode = get_html(url + number)
except:
htmlcode = get_html('https://www.javbus.com/' + number)
if "<title>404 Page Not Found" in htmlcode:
return {"title": ""}
lx = etree.fromstring(htmlcode,etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'imagecut': 1,
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(lx),
'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(lx),
'无码': getUncensored(lx)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
return main_uncensored(number)
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == "__main__" :
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('STAR-438'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001'))
print(main('AVSW-061'))

View File

@@ -1,321 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
def getTitle(html):
browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip()
def getActor(html):
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = []
idx = 0
actor_gender = config.getInstance().actor_gender()
if actor_gender not in ['female','male','both','all']:
actor_gender = 'female'
for act in actors:
if((actor_gender == 'all')
or (actor_gender == 'both' and genders[idx] in ['symbol female', 'symbol male'])
or (actor_gender == 'female' and genders[idx] == 'symbol female')
or (actor_gender == 'male' and genders[idx] == 'symbol male')):
r.append(act)
idx = idx + 1
return r
def getaphoto(url, session):
html_page = session.get(url).text
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
return img_url[0] if img_url else ''
def getActorPhoto(html, javdb_site, session):
actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
if not actorall:
return {}
a = getActor(html)
actor_photo = {}
if not session:
session = get_html_session()
for i in actorall:
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
if not len(x) or not len(x[0]) or i.text not in a:
continue
actor_id = x[0]
pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
if not session.head(pic_url).ok:
pic_url = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
if len(pic_url):
actor_photo[i.text] = pic_url
return actor_photo
def getStudio(a, html):
patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
pianshang = patherr.findall(a)
if pianshang:
result = pianshang[0].strip()
if len(result):
return result
# fall back to the seller as the studio
try:
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
except:
result = ''
return result
def getRuntime(html):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(html):
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
def getYear(getRelease):
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)\-.*?</span>')
dates = patherr.findall(getRelease)
if dates:
result = dates[0]
else:
result = ''
return result
def getRelease(a):
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)</span>')
dates = patherr.findall(a)
if dates:
result = dates[0]
else:
result = ''
return result
def getTag(html):
try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
return result
except:
result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
return result
def getCover_small(html, index=0):
# same issue mentioned below:
# javdb sometimes returns multiple results,
# DO NOT just get the first one, get the one with the correct index number
try:
result = html.xpath("//*[contains(@class,'movie-list')]/div/a/div[contains(@class, 'cover')]/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except:
result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getTrailer(htmlcode): # 获取预告片
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
video = video_pather.findall(htmlcode)
# also check that the match list is not empty
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
video_url = video[0]
else:
video_url = ''
return video_url
def getExtrafanart(html): # 获取剧照
result = []
try:
result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
except:
pass
return result
def getCover(html):
try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result
def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title, uncensored): #获取剧情介绍 多进程并发查询
return getStoryline(number, title, 无码=uncensored)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
def main(number):
# after a javdb update, only one numbered mirror can stay logged in at a time (a new login kicks out the old one), so the site is chosen from the first javdb*.json cookie file found
# if there is no .json file, or it has expired, a mirror is picked at random
javdb_sites = config.getInstance().javdb_sites().split(',')
debug = config.getInstance().debug()
for i in javdb_sites:
javdb_sites[javdb_sites.index(i)] = "javdb" + i
javdb_sites.append("javdb")
try:
# if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
# pass
# else:
# number = number.upper()
number = number.upper()
javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
# do not load expired cookies; the javdb login page advertises a 7-day login-free session, so cookies are assumed valid for 7 days
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath)
if cdays < 7:
javdb_cookies = cookies_dict
has_json = True
break
elif cdays != 9999:
print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
if not has_json:
javdb_site = secrets.choice(javdb_sites)
if debug:
print(f'[!]javdb:select site {javdb_site}')
session = None
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
try:
if debug:
raise # try get_html_by_scraper() branch
res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
if not res:
raise
query_result = res.text
except:
res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
if not res:
raise ValueError('page not found')
query_result = res.text
if session is None:
raise ValueError('page not found')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# javdb sometimes returns multiple results,
# and the first element may not be the one we are looking for,
# so iterate over all candidates and find the matching one
urls = html.xpath('//*[contains(@class,"movie-list")]/div/a/@href')
# western releases list ids like ['Blacked','Blacked'], so exact-number matching will not work; take the first result
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
correct_url = urls[0]
else:
ids = html.xpath('//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
try:
correct_url = urls[ids.index(number)]
except:
# to avoid picking up a wrong number, only accept an exact match
if ids[0].upper() != number:
raise ValueError("number not found")
correct_url = urls[0]
try:
# get faster benefit from http keep-alive
javdb_detail_url = urljoin(res.url, correct_url)
detail_page = session.get(javdb_detail_url).text
except:
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
session = None
# etree.fromstring is expensive, so call it only once; its xpath() is fast (faster than bs4 find/select) and can be reused freely
lx = etree.fromstring(detail_page, etree.HTMLParser())
imagecut = 1
dp_number = getNum(lx)
if dp_number.upper() != number.upper():
raise ValueError("number not eq"+dp_number)
title = getTitle(lx)
if title and dp_number:
number = dp_number
# remove duplicate title
title = title.replace(number, '').strip()
dic = {
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
'number': number,
'cover': getCover(lx),
'trailer': getTrailer(detail_page),
'extrafanart': getExtrafanart(lx),
'imagecut': imagecut,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py',
'series': getSeries(lx),
'无码': getUncensored(lx)
}
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
dic['series'] = dic['studio']
if not dic['label']:
dic['label'] = dic['studio']
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = getActorPhoto(lx, javdb_site, session)
except Exception as e:
if debug:
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30'))
print(main('AGAV-042'))
print(main('BANK-022'))
print(main('070116-197'))
print(main('093021_539')) # 没有剧照 片商pacopacomama
#print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
print(main('MVSD-439'))
# print(main('EHM0001')) # not found
#print(main('FC2-2314275'))
print(main('EBOD-646'))
print(main('LOVE-262'))
print(main('ABP-890'))
print(main('blacked.14.12.08'))
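Like the other WebCrawler modules removed in this commit, main() returns a JSON string rather than a dict, and the aggregator json.loads() it before use. A minimal illustration of that contract (the number below is only an example):

import json
from WebCrawler import javdb

raw = javdb.main('ABP-890')        # JSON string; '{"title": ""}' on failure
data = json.loads(raw)
if data.get('title') and data.get('number'):
    print(data['number'], data['title'])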

View File

@@ -1,161 +0,0 @@
import sys
sys.path.append('../')
import json
import bs4
import re
from WebCrawler import airav
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie
from ADC_function import get_javlib_cookie, get_html
def main(number: str):
raw_cookies, user_agent = get_javlib_cookie()
# Blank cookies mean javlib site return error
if not raw_cookies:
return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
# Manually construct a dictionary
s_cookie = SimpleCookie()
s_cookie.load(raw_cookies)
cookies = {}
for key, morsel in s_cookie.items():
cookies[key] = morsel.value
# Scraping
result = get_html(
"http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
cookies=cookies,
ua=user_agent,
return_type="object"
)
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>')
fanhao = fanhao_pather.findall(result.text)
if "/?v=jav" in result.url:
dic = {
"title": get_title(lx, soup),
"studio": get_table_el_single_anchor(soup, "video_maker"),
"year": get_table_el_td(soup, "video_date")[:4],
"outline": get_outline(number),
"director": get_table_el_single_anchor(soup, "video_director"),
"cover": get_cover(lx),
"imagecut": 1,
"actor_photo": "",
"website": result.url,
"source": "javlib.py",
"actor": get_table_el_multi_anchor(soup, "video_cast"),
"label": get_table_el_td(soup, "video_label"),
"tag": get_table_el_multi_anchor(soup, "video_genres"),
"number": get_table_el_td(soup, "video_id"),
"release": get_table_el_td(soup, "video_date"),
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
"series":'',
}
elif number.upper() in fanhao:
url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')
s = {}
url_list = url_pather.findall(result.text)
for url in url_list:
s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.')
av_url = s[number.upper()]
result = get_html(
av_url,
cookies=cookies,
ua=user_agent,
return_type="object"
)
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
dic = {
"title": get_title(lx, soup),
"studio": get_table_el_single_anchor(soup, "video_maker"),
"year": get_table_el_td(soup, "video_date")[:4],
"outline": get_outline(number),
"director": get_table_el_single_anchor(soup, "video_director"),
"cover": get_cover(lx),
"imagecut": 1,
"actor_photo": "",
"website": result.url,
"source": "javlib.py",
"actor": get_table_el_multi_anchor(soup, "video_cast"),
"label": get_table_el_td(soup, "video_label"),
"tag": get_table_el_multi_anchor(soup, "video_genres"),
"number": get_table_el_td(soup, "video_id"),
"release": get_table_el_td(soup, "video_date"),
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
"series": '',
}
else:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
return lx.xpath(xpath)[0].strip()
def get_outline(number):
try:
response = json.loads(airav.main(number))
result = response['outline']
return result
except:
return ''
def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
tag = soup.find(id=tag_id).find("a")
if tag is not None:
return tag.string.strip()
else:
return ""
def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
tags = soup.find(id=tag_id).find_all("a")
return process(tags)
def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
tags = soup.find(id=tag_id).find_all("td", class_="text")
return process(tags)
def process(tags: bs4.element.ResultSet) -> str:
values = []
for tag in tags:
value = tag.string
if value is not None and value != "----":
values.append(value)
return ",".join(x for x in values if x)
def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
number = get_table_el_td(soup, "video_id")
return title.replace(number, "").strip()
def get_cover(lx: html.HtmlComment) -> str:
return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
if __name__ == "__main__":
lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
#lists = ["DVMC-003"]
for num in lists:
print(main(num))
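The SimpleCookie handling above is standard-library only: it turns the raw cookie string returned by get_javlib_cookie() into the plain dict that get_html() expects. A self-contained illustration with a made-up cookie value:

from http.cookies import SimpleCookie

raw_cookies = "cf_clearance=abc123; over18=1"   # illustrative value only
s_cookie = SimpleCookie()
s_cookie.load(raw_cookies)
cookies = {key: morsel.value for key, morsel in s_cookie.items()}
print(cookies)   # {'cf_clearance': 'abc123', 'over18': '1'}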

View File

@@ -1,173 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
import json
import re
from lib2to3.pgen2 import parse
from urllib.parse import urlparse, unquote
def getActorPhoto(html):
return ''
def getTitle(html): # 获取标题
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(html.xpath("/html/head/title/text()")[0])
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getStudio(html): # 获取厂商 已修改
try:
category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
return category.strip()
except:
return '麻豆社'
def getYear(html): # 获取年份
return ''
def getCover(htmlcode): # 获取封面图片
try:
url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html): # 获取出版日期
return ''
def getRuntime(html): # 获取播放时长
return ''
def getUrl(html):
return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0])
def getNum(url, number): # 获取番号
try:
# 解码url
filename = unquote(urlparse(url).path)
# 裁剪文件名
result = filename[1:-5].upper().strip()
# 移除中文
if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
# 移除多余的符号
return result.strip('-')
except:
return ''
def getDirector(html): # 获取导演 已修改
return ''
def getOutline(html): # 获取概述
return ''
def getSerise(html): # 获取系列 已修改
return ''
def getTag(html, studio): # 获取标签
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html): # 获取剧照
return ''
def cutTags(tags):
actors = []
remaining = []
for tag in tags:
actors.append(tag)
return actors, remaining
def main(number):
try:
try:
number = number.lower().strip()
url = "https://madou.club/" + number + ".html"
htmlcode = get_html(url)
except:
# print(number)
pass
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
studio = getStudio(html)
tags = getTag(html, studio)
#actor, tags = cutTags(tags) # the actor's position inside the tag list is not fixed, so extracting it was abandoned
actor = ''
dic = {
# 标题
'title': getTitle(html),
# 制作商
'studio': studio,
# 年份
'year': getYear(html),
# 简介
'outline': getOutline(html),
#
'runtime': getRuntime(html),
# 导演
'director': getDirector(html),
# 演员
'actor': actor,
# 发售日
'release': getRelease(html),
# 番号
'number': getNum(url, number),
# 封面链接
'cover': getCover(htmlcode),
# 剧照获取
'extrafanart': getExtrafanart(html),
'imagecut': 1,
#
'tag': tags,
#
'label': getSerise(html),
# 作者图片
'website': url,
'source': 'madou.py',
# 使用
'series': getSerise(html),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))
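The getNum() logic above recovers the number from the detail-page URL rather than from the page body. A worked example of the same steps on an invented URL:

import re
from urllib.parse import urlparse, unquote

url = 'https://madou.club/md0140-2%E5%AE%B6%E6%9C%89%E6%80%A7%E4%BA%8B.html'   # invented
filename = unquote(urlparse(url).path)                  # '/md0140-2家有性事.html'
result = filename[1:-5].upper().strip()                 # drop leading '/' and trailing '.html'
result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]   # drop the non-ASCII tail
print(result.strip('-'))                                # 'MD0140-2'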

View File

@@ -1,68 +0,0 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler.crawler import *
class MgsCrawler(Crawler):
def getMgsString(self, _xpath):
html = self.html
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getExtrafanart(htmlcode2): # 获取剧照
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode2)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def main(number2):
number=number2.upper()
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode2, 'lxml')
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b)
dic = {
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'imagecut': 1,
'tag': getTag(a2),
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'extrafanart': getExtrafanart(htmlcode2),
'year': str(re.findall('\d{4}',a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
# str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
print(main('SIRO-4149'))
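mgstage gates its detail pages behind an age-check redirect; the get_html() call above skips it by presenting the 'adc=1' cookie. The same idea with plain requests, as a rough standalone sketch (not the project's helper, and the site may additionally expect a browser User-Agent):

import requests

url = 'https://www.mgstage.com/product/product_detail/SIRO-4149/'
resp = requests.get(url, cookies={'adc': '1'}, timeout=15)
print(resp.status_code, len(resp.text))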

View File

@@ -1,154 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
host = 'https://www.91mv.org'
def getActorPhoto(html):
return ''
def getTitle(html): #获取标题
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall('(.*)(91.*-\d*)',title)[0][0])
return result.strip()
except:
return ''
def getStudio(html): #获取厂商 已修改
return '91制片厂'
def getYear(html): #获取年份
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getCover(htmlcode): #获取封面图片
try:
url = str(re.findall('var pic_url = "(.*?)"',htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html): #获取出版日期
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getRuntime(htmlcode): #获取播放时长
return ''
def getActor(html): #获取女优
b=[]
for player in html.xpath('//p[@class="player-name"]/text()'):
player = player.replace('主演:','')
b.append(player)
return b
def getNum(html): #获取番号
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall('(.*)(91.*-\d*)',title)[0][1])
return result.strip()
except:
return ''
def getDirector(html): #获取导演 已修改
return ''
def getOutline(html): #获取概述
try:
result = str(html.xpath('//div[@class="play-text"]/text()')[0])
return result.strip()
except:
return ''
def getSerise(htmlcode): #获取系列 已修改
return ''
def getTag(html): # 获取标签
return html.xpath('//div[@class="player-tag"]/text()')
def getExtrafanart(htmlcode): # 获取剧照
return ''
def search(keyword): #搜索,返回结果
search_html = get_html(host + '/index/search?keywords=' + keyword)
html = etree.fromstring(search_html, etree.HTMLParser())
return html.xpath('//a[@class="video-list"]/@href')[0]
def main(number):
try:
try:
number = number.replace('91CM-','').replace('91MS-','')
url = host + str(search(number))
htmlcode = get_html(url)
except:
# print(number)
pass
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
# 标题
'title': getTitle(html),
# 制作商
'studio': getStudio(html),
# 年份
'year': getYear(html),
# 简介
'outline': getOutline(html),
#
'runtime': getRuntime(html),
# 导演
'director': getDirector(html),
# 演员
'actor': getActor(html),
# 发售日
'release': getRelease(html),
# 番号
'number': getNum(html),
# 封面链接
'cover': getCover(htmlcode),
# 剧照获取
'extrafanart': getExtrafanart(html),
'imagecut': 1,
#
'tag': getTag(html),
#
'label': getSerise(html),
# 作者图片
'website': url,
'source': 'mv91.py',
# 使用
'series': getSerise(html)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
print(main('91CM-121'))
print(main('91CM-122'))
print(main('91CM-143'))
print(main('91MS-006'))

View File

@@ -1,220 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
def getTitle(html):
result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
return result
def getActor(browser):
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(browser):
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = {i.text.strip(): i['href'] for i in htmla}
o = {}
for k, v in t.items():
r = browser.open_relative(v)
if not r.ok:
continue
pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
if 'noimage.gif' in pic['src']:
continue
o[k] = urljoin(browser.url, pic['src'])
return o
def getStudio(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
except:
result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
return result.strip('+').replace("', '", '').replace('"', '')
def getRuntime(html):
try:
x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
return x
except:
return ''
def getLabel(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
return result
except:
return ''
def getNum(html):
try:
result = html.xpath('//*[@id="hinban"]/text()')[0]
return result
except:
return ''
def getYear(getRelease):
try:
result = str(re.search('\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except:
return ''
try:
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
except:
return ''
def getTag(html):
result = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
total = []
for i in result:
total.append(i.replace("\n","").replace("\t",""))
return total
def getCover_small(html, index=0):
# same issue mentioned below:
# javdb sometimes returns multiple results,
# DO NOT just get the first one, get the one with the correct index number
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
return 'https:' + result
except:
return ''
def getDirector(html):
try:
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
return result
except:
return ''
def getOutline(html, number, title):
storyline_site = config.getInstance().storyline_site().split(',')
a = set(storyline_site) & {'airav', 'avno1'} # only the Chinese synopsis text is wanted
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site, 无码=False)
if len(g):
return g
try:
x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
return x.replace(getNum(html), '')
except:
return ''
def getSeries(html):
try:
try:
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
return result
except:
result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
return result
except:
return ''
def getExtrafanart(htmlcode): # 获取剧照
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
def open_by_browser(number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
return str(browser.page), browser
def main(number):
try:
detail_page, browser = open_by_browser(number)
url = browser.url
lx = etree.fromstring(detail_page, etree.HTMLParser())
newnum = getNum(lx).upper()
number_up = number.upper()
if newnum != number_up:
if newnum == number.replace('-','').upper():
newnum = number_up
else:
raise ValueError("xcity.py: number not found")
title = getTitle(lx)
dic = {
'actor': getActor(browser),
'title': title,
'studio': getStudio(lx),
'outline': getOutline(lx, number, title),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(lx),
'number': newnum,
'cover': getCover(lx),
'cover_small': '',
'extrafanart': getExtrafanart(detail_page),
'imagecut': 1,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
'website': url,
'source': 'xcity.py',
'series': getSeries(lx),
}
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = getActorPhoto(browser)
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('RCTD-288'))
print(main('VNDS-2624'))
print(main('ABP-345'))

View File

@@ -14,7 +14,8 @@ from datetime import datetime
from lxml import etree
from ADC_function import *
from WebCrawler import get_data_from_json
# from WebCrawler import get_data_from_json
from scraper import get_data_from_json
from number_parser import is_uncensored
from ImageProcessing import cutImage

View File

@@ -1,45 +1,11 @@
import json
import re
from multiprocessing.pool import ThreadPool
import ADC_function
import secrets
import config
from ADC_function import translate
from lxml import etree
from pathlib import Path
# =========website========
from . import airav
from . import avsox
from . import fanza
from . import fc2
from . import jav321
from . import javbus
from . import javdb
from . import mgstage
from . import xcity
# from . import javlib
from . import dlsite
from . import carib
from . import fc2club
from . import mv91
from . import madou
from . import gcolle
from . import getchu
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
from scrapinglib.api import search
def get_data_from_json(file_number, oCC):
"""
@@ -49,116 +15,45 @@ def get_data_from_json(file_number, oCC):
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
func_mapping = {
"airav": airav.main,
"avsox": avsox.main,
"fc2": fc2.main,
"fanza": fanza.main,
"javdb": javdb.main,
"javbus": javbus.main,
"mgstage": mgstage.main,
"jav321": jav321.main,
"xcity": xcity.main,
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main,
"gcolle": gcolle.main,
"getchu": getchu.main,
}
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
def insert(sources,source):
if source in sources:
sources.insert(0, sources.pop(sources.index(source)))
return sources
sources = conf.sources()
if len(sources) <= len(func_mapping):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
):
sources = insert(sources,"carib")
elif "item" in file_number or "GETCHU" in file_number.upper():
sources = insert(sources,"getchu")
elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
sources = insert(sources, "getchu")
sources = insert(sources, "dlsite")
elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
if "avsox" in sources:
sources = insert(sources,"avsox")
elif "mgstage" in sources and \
(re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources,"mgstage")
elif "fc2" in lo_file_number:
if "fc2" in sources:
sources = insert(sources,"fc2")
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources = insert(sources,"gcolle")
elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
if "xcity" in sources:
sources = insert(sources,"xcity")
if "madou" in sources:
sources = insert(sources,"madou")
elif "madou" in sources and (
re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources = insert(sources,"madou")
# check sources in func_mapping
todel = []
for s in sources:
if not s in func_mapping:
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:
print('[!] Remove Source : ' + d)
sources.remove(d)
json_data = {}
if conf.multi_threading():
pool = ThreadPool(processes=len(conf.sources().split(',')))
# Set the priority of multi-thread crawling and join the multi-thread queue
for source in sources:
pool.apply_async(func_mapping[source], (file_number,))
# Get multi-threaded crawling response
for source in sources:
if conf.debug() == True:
print('[+]select', source)
try:
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
except:
json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
# TODO: prepare the parameters
# - clean up ADC_function and webcrawler
proxies = None
configProxy = conf.proxy()
if configProxy.enable:
proxies = configProxy.proxies()
javdb_sites = conf.javdb_sites().split(',')
for i in javdb_sites:
javdb_sites[javdb_sites.index(i)] = "javdb" + i
javdb_sites.append("javdb")
# do not load expired cookies; the javdb login page advertises a 7-day login-free session, so cookies are assumed valid for 7 days
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath)
if cdays < 7:
javdb_cookies = cookies_dict
has_json = True
break
pool.close()
pool.terminate()
else:
for source in sources:
try:
if conf.debug() == True:
print('[+]select', source)
try:
json_data = json.loads(func_mapping[source](file_number))
except:
json_data = func_mapping[source](file_number)
# if any service return a valid return, break
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
except:
continue
elif cdays != 9999:
print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
if not has_json:
javdb_site = secrets.choice(javdb_sites)
javdb_cookies = None
cacert = None
if conf.cacert_file():
cacert = conf.cacert_file()
json_data = search(file_number, sources, proxies=proxies, verify=cacert,
dbsite=javdb_site, dbcookies=javdb_cookies,
morestoryline=conf.is_storyline())
# Return if data not found in all sources
if not json_data:
print('[-]Movie Number not found!')
@@ -316,26 +211,26 @@ def get_data_from_json(file_number, oCC):
try:
if ccm == 1:
json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
except:
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
else:
try:
if ccm == 1:
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
except IndexError:
json_data[cc] = oCC.convert(json_data[cc])
except:
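Taken together, the removed and added lines above replace the per-module func_mapping, source reordering and thread pool inside get_data_from_json with a single call into scrapinglib. A hedged sketch of the new call path, using illustrative values for the number and source list:

from scrapinglib.api import search

json_data = search(
    'ABP-890',                     # file_number
    'javbus,javdb',                # same format as conf.sources()
    proxies=None,                  # or conf.proxy().proxies() when the proxy is enabled
    verify=None,                   # or conf.cacert_file() when a CA bundle is configured
    dbsite='javdb',                # javdb mirror picked from the freshest javdb*.json
    dbcookies=None,                # cookies loaded from that file, if any
    morestoryline=False,           # conf.is_storyline()
)
print(json_data.get('title') if json_data else '[-]Movie Number not found!')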

3
scrapinglib/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
from .api import search

114
scrapinglib/airav.py Normal file
View File

@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree
from .parser import Parser
from .javbus import Javbus
class Airav(Parser):
source = 'airav'
expr_title = '/html/head/title/text()'
expr_number = '/html/head/title/text()'
expr_studio = '//a[contains(@href,"?video_factory=")]/text()'
expr_release = '//li[contains(text(),"發片日期")]/text()'
expr_outline = "string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)"
expr_actor = '//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()'
expr_cover = '//img[contains(@src,"/storage/big_pic/")]/@src'
expr_tags = '//div[@class="tagBtnMargin"]/a/text()'
expr_extrafanart = '//div[@class="mobileImgThumbnail"]/a/@href'
def search(self, number):
self.number = number
self.detailurl = 'https://cn.airav.wiki/video/' + number
engine = Javbus()
javbusinfo = engine.scrape(number, self)
if javbusinfo == 404:
self.javbus = {"title": ""}
else:
self.javbus = json.loads(javbusinfo)
self.htmlcode = self.getHtml(self.detailurl)
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
# return super().getNum(htmltree)
result = self.javbus.get('number')
if isinstance(result, str) and len(result):
return result
number = super().getNum(htmltree)
result = str(re.findall('^\[(.*?)]', number)[0])
return result
def getTitle(self, htmltree):
title = super().getTitle(htmltree)
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
def getStudio(self, htmltree):
result = self.javbus.get('studio')
if isinstance(result, str) and len(result):
return result
return super().getStudio(htmltree)
def getRelease(self, htmltree):
result = self.javbus.get('release')
if isinstance(result, str) and len(result):
return result
try:
return re.search(r'\d{4}-\d{2}-\d{2}', str(super().getRelease(htmltree))).group()
except:
return ''
def getYear(self, htmltree):
result = self.javbus.get('year')
if isinstance(result, str) and len(result):
return result
release = self.getRelease(htmltree)
return str(re.findall('\d{4}', release)).strip(" ['']")
def getOutline(self, htmltree):
return self.getTreeAll(htmltree, self.expr_outline).replace('\n','').strip()
def getRuntime(self, htmltree):
result = self.javbus.get('runtime')
if isinstance(result, str) and len(result):
return result
return ''
def getDirector(self, htmltree):
result = self.javbus.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getActors(self, htmltree):
b=[]
a = super().getActors(htmltree)
for v in a:
v = v.strip()
if len(v):
b.append(v)
if len(b):
return b
result = self.javbus.get('actor')
if isinstance(result, list) and len(result):
return result
return []
def getCover(self, htmltree):
result = self.javbus.get('cover')
if isinstance(result, str) and len(result):
return result
return super().getCover(htmltree)
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getSeries(self, htmltree):
result = self.javbus.get('series')
if isinstance(result, str) and len(result):
return result
return ''
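Every site module in the new scrapinglib package follows the pattern shown above: a Parser subclass declares expr_* XPath strings, overrides individual getters where the defaults are not enough, and lets search() fetch the page and hand the tree to dictformat(). Purely as a sketch of that structure for a hypothetical module inside the package (the class name, site and XPaths below are invented, not part of this commit):

from .parser import Parser

class Examplesite(Parser):
    source = 'examplesite'                              # hypothetical source name
    expr_title = '/html/head/title/text()'
    expr_number = '//span[@class="code"]/text()'
    expr_cover = '//img[@class="cover"]/@src'

    def search(self, number):
        self.number = number
        self.detailurl = 'https://example.com/video/' + number   # invented URL
        htmltree = self.getHtmlTree(self.detailurl)
        return self.dictformat(htmltree)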

225
scrapinglib/api.py Normal file
View File

@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
import re
import json
from .airav import Airav
from .carib import Carib
from .dlsite import Dlsite
from .fanza import Fanza
from .gcolle import Gcolle
from .getchu import Getchu
from .jav321 import Jav321
from .javdb import Javdb
from .mv91 import Mv91
from .fc2 import Fc2
from .madou import Madou
from .mgstage import Mgstage
from .javbus import Javbus
from .xcity import Xcity
from .avsox import Avsox
from .tmdb import Tmdb
def search(number, sources: str=None, proxies=None, verify=None, type='adult',
dbcookies=None, dbsite=None, morestoryline=False):
""" 根据``番号/电影``名搜索信息
:param number: number/name depends on type
:param sources: sources string with `,` like ``avsox,javbus``
:param type: ``adult``, ``general``
"""
sc = Scraping()
return sc.search(number, sources, proxies=proxies, verify=verify, type=type,
dbcookies=dbcookies, dbsite=dbsite, morestoryline=morestoryline)
class Scraping():
"""
"""
adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
'gcolle', 'javdb', 'getchu']
adult_func_mapping = {
'avsox': Avsox().scrape,
'javbus': Javbus().scrape,
'xcity': Xcity().scrape,
'mgstage': Mgstage().scrape,
'madou': Madou().scrape,
'fc2': Fc2().scrape,
'dlsite': Dlsite().scrape,
'jav321': Jav321().scrape,
'fanza': Fanza().scrape,
'airav': Airav().scrape,
'carib': Carib().scrape,
'mv91': Mv91().scrape,
'gcolle': Gcolle().scrape,
'javdb': Javdb().scrape,
'getchu': Getchu().scrape,
}
general_full_sources = ['tmdb']
general_func_mapping = {
'tmdb': Tmdb().scrape,
}
proxies = None
verify = None
dbcookies = None
dbsite = None
# use the storyline module to fetch a fuller plot summary
morestoryline = False
def search(self, number, sources=None, proxies=None, verify=None, type='adult',
dbcookies=None, dbsite=None, morestoryline=False):
self.proxies = proxies
self.verify = verify
self.dbcookies = dbcookies
self.dbsite = dbsite
self.morestoryline = morestoryline
if type == 'adult':
return self.searchAdult(number, sources)
else:
return self.searchGeneral(number, sources)
def searchGeneral(self, name, sources):
""" 查询电影电视剧
imdb,tmdb
"""
sources = self.checkGeneralSources(sources, name)
json_data = {}
for source in sources:
try:
print('[+]select', source)
try:
data = self.general_func_mapping[source](name, self)
if data == 404:
continue
json_data = json.loads(data)
except Exception as e:
print('[!] 出错啦')
print(e)
# if any service return a valid return, break
if self.get_data_state(json_data):
print(f"[+]Find movie [{name}] metadata on website '{source}'")
break
except:
continue
# Return if data not found in all sources
if not json_data:
print(f'[-]Movie Number [{name}] not found!')
return None
return json_data
def searchAdult(self, number, sources):
sources = self.checkAdultSources(sources, number)
json_data = {}
for source in sources:
try:
print('[+]select', source)
try:
data = self.adult_func_mapping[source](number, self)
if data == 404:
continue
json_data = json.loads(data)
except Exception as e:
print('[!] 出错啦')
print(e)
# json_data = self.func_mapping[source](number, self)
# if any service return a valid return, break
if self.get_data_state(json_data):
print(f"[+]Find movie [{number}] metadata on website '{source}'")
break
except:
continue
# Return if data not found in all sources
if not json_data:
print(f'[-]Movie Number [{number}] not found!')
return None
return json_data
def checkGeneralSources(self, c_sources, name):
if not c_sources:
sources = self.general_full_sources
else:
sources = c_sources.split(',')
# check sources in func_mapping
todel = []
for s in sources:
if not s in self.general_func_mapping:
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:
print('[!] Remove Source : ' + d)
sources.remove(d)
return sources
def checkAdultSources(self, c_sources, file_number):
if not c_sources:
sources = self.adult_full_sources
else:
sources = c_sources.split(',')
def insert(sources,source):
if source in sources:
sources.insert(0, sources.pop(sources.index(source)))
return sources
if len(sources) <= len(self.adult_func_mapping):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
):
sources = insert(sources,"carib")
elif "item" in file_number or "GETCHU" in file_number.upper():
sources = insert(sources,"getchu")
elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
sources = insert(sources, "getchu")
sources = insert(sources, "dlsite")
elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
if "avsox" in sources:
sources = insert(sources,"avsox")
elif "mgstage" in sources and \
(re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources,"mgstage")
elif "fc2" in lo_file_number:
if "fc2" in sources:
sources = insert(sources,"fc2")
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources = insert(sources,"gcolle")
elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
if "xcity" in sources:
sources = insert(sources,"xcity")
if "madou" in sources:
sources = insert(sources,"madou")
elif "madou" in sources and (
re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources = insert(sources,"madou")
# check sources in func_mapping
todel = []
for s in sources:
if not s in self.adult_func_mapping:
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:
print('[!] Remove Source : ' + d)
sources.remove(d)
return sources
def get_data_state(self, data: dict) -> bool: # 元数据获取失败检测
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
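checkAdultSources() is also where the old filename heuristics live on: certain number shapes push a matching source to the front of the list before scraping starts. A small illustration (the number is an example in the 6-digit-dash-3-digit Caribbean format):

from scrapinglib.api import Scraping

sc = Scraping()
order = sc.checkAdultSources(None, '010122-001')   # None -> use the full default list
print(order[0])   # 'carib', because the number matches ^\d{6}-\d{3}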

80
scrapinglib/avsox.py Normal file
View File

@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
import re
from .parser import Parser
class Avsox(Parser):
source = 'avsox'
imagecut = 3
expr_number = '//span[contains(text(),"识别码:")]/../span[2]/text()'
expr_actor = '//a[@class="avatar-box"]'
expr_actorphoto = '//a[@class="avatar-box"]'
expr_title = '/html/body/div[2]/h3/text()'
expr_studio = '//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()'
expr_release = '//span[contains(text(),"发行时间:")]/../text()'
expr_cover = '/html/body/div[2]/div[1]/div[1]/a/img/@src'
expr_smallcover = '//*[@id="waterfall"]/div/a/div[1]/img/@src'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
expr_label = '//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'
expr_series = '//span[contains(text(),"系列:")]/../span[2]/text()'
def queryNumberUrl(self, number):
qurySiteTree = self.getHtmlTree('https://tellme.pw/avsox')
site = self.getTreeElement(qurySiteTree, '//div[@class="container"]/div/a/@href')
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number)
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('-', '_'))
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
self.searchtree = self.getHtmlTree(site + '/cn/search/' + number.replace('_', ''))
result1 = self.getTreeElement(self.searchtree, '//*[@id="waterfall"]/div/a/@href')
return "https:" + result1
def getNum(self, htmltree):
new_number = self.getTreeElement(htmltree, self.expr_number)
if new_number.upper() != self.number.upper():
raise ValueError('number not found in ' + self.source)
self.number = new_number
return new_number
def getTitle(self, htmltree):
return super().getTitle(htmltree).replace('/', '').strip(self.number)
def getStudio(self, htmltree):
return super().getStudio(htmltree).replace("', '", ' ')
def getSmallCover(self, htmltree):
""" 使用搜索页面的预览小图
"""
return self.getTreeElement(self.searchtree, self.expr_smallcover)
def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',')
return [i.strip() for i in tags[2:]] if len(tags) > 2 else []
def getOutline(self, htmltree):
if self.morestoryline:
from .storyline import getStoryline
return getStoryline(self.number)
return ''
def getActors(self, htmltree):
a = super().getActors(htmltree)
d = []
for i in a:
d.append(i.find('span').text)
return d
def getActorPhoto(self, htmltree):
a = super().getActorPhoto(htmltree)
d = {}
for i in a:
l = i.find('.//img').attrib['src']
t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d

99
scrapinglib/carib.py Normal file
View File

@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin
from lxml import html
from .parser import Parser
class Carib(Parser):
source = 'carib'
uncensored = True
expr_title = "//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()"
expr_release = "//li[2]/span[@class='spec-content']/text()"
expr_runtime = "//span[@class='spec-content']/span[@itemprop='duration']/text()"
expr_actor = "//span[@class='spec-content']/a[@itemprop='actor']/span/text()"
expr_tags = "//span[@class='spec-content']/a[@itemprop='genre']/text()"
expr_extrafanart = "//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href"
expr_label = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
expr_series = "//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()"
expr_outline = "//div[@class='movie-info section']/p[@itemprop='description']/text()"
def search(self, number):
self.number = number
self.detailurl = f'https://www.caribbeancom.com/moviepages/{number}/index.html'
htmlcode = self.getHtml(self.detailurl)
if htmlcode == 404 or 'class="movie-info section"' not in htmlcode:
return 404
htmltree = html.fromstring(htmlcode)
result = self.dictformat(htmltree)
return result
def getStudio(self, htmltree):
return '加勒比'
def getActors(self, htmltree):
r = []
actors = super().getActors(htmltree)
for act in actors:
if str(act) != '':
r.append(act)
return r
def getNum(self, htmltree):
return self.number
def getCover(self, htmltree):
return f'https://www.caribbeancom.com/moviepages/{self.number}/images/l_l.jpg'
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
r = []
genres = self.getTreeAll(htmltree, self.expr_extrafanart)
for g in genres:
jpg = str(g)
if '/member/' in jpg:
break
else:
r.append('https://www.caribbeancom.com' + jpg)
return r
def getActorPhoto(self, htmltree):
# return super().getActorPhoto(htmltree)
htmla = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
names = htmltree.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
t = {}
for name, a in zip(names, htmla):
if name.strip() == '':
continue
p = {name.strip(): a.attrib['href']}
t.update(p)
o = {}
for k, v in t.items():
if '/search_act/' not in v:
continue
r = self.getHtml(urljoin('https://www.caribbeancom.com', v), type='object')
if not r.ok:
continue
html = r.text
pos = html.find('.full-bg')
if pos<0:
continue
css = html[pos:pos+100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(r.url, cssBGjpgs[0])}
o.update(p)
return o
def getOutline(self, htmltree):
from .storyline import getStoryline
result = getStoryline(self.number, uncensored=self.uncensored)
if len(result):
return result
return super().getOutline(htmltree)

97
scrapinglib/dlsite.py Normal file
View File

@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Dlsite(Parser):
source = 'dlsite'
imagecut = 4
allow_number_change = True
expr_title = '/html/head/title/text()'
expr_actor = '//th[contains(text(),"声优")]/../td/a/text()'
expr_studio = '//th[contains(text(),"商标名")]/../td/span[1]/a/text()'
expr_studio2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
expr_outline = '//*[@class="work_parts_area"]/p/text()'
expr_series = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
expr_series2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_director = '//th[contains(text(),"剧情")]/../td/a/text()'
expr_release = '//th[contains(text(),"贩卖日")]/../td/a/text()'
expr_cover = '//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset'
expr_tags = '//th[contains(text(),"分类")]/../td/div/a/text()'
expr_label = '//th[contains(text(),"系列名")]/../td/span[1]/a/text()'
expr_label2 = '//th[contains(text(),"社团名")]/../td/span[1]/a/text()'
expr_extrafanart = '//*[@id="work_left"]/div/div/div[1]/div/@data-src'
def search(self, number):
self.cookies = {'locale': 'zh-cn'}
if "RJ" in number or "VJ" in number:
self.number = number.upper()
self.detailurl = 'https://www.dlsite.com/maniax/work/=/product_id/' + self.number + '.html/?locale=zh_CN'
htmltree = self.getHtmlTree(self.detailurl)
else:
self.detailurl = f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie'
htmltree = self.getHtmlTree(self.detailurl)
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
if "" in number:
number = number.replace("","")
elif "" in number:
number = number.replace("","")
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
htmltree = self.getHtmlTree(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie')
search_result = self.getTreeAll(htmltree, '//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
self.detailurl = search_result[0]
htmltree = self.getHtmlTree(self.detailurl)
self.number = str(re.findall("\wJ\w+", self.detailurl)).strip(" [']")
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return self.number
def getTitle(self, htmltree):
result = super().getTitle(htmltree)
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
if 'OFF】' in result:
result = result[result.find('】')+1:]
result = result.replace('【HD版】', '')
return result
def getOutline(self, htmltree):
total = []
result = self.getTreeAll(htmltree, self.expr_outline)
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getRelease(self, htmltree):
return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')
def getCover(self, htmltree):
return 'https:' + super().getCover(htmltree).replace('.webp', '.jpg')
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
try:
result = []
for i in self.getTreeAll(self.expr_extrafanart):
result.append("https:" + i)
except:
result = ''
return result

130
scrapinglib/fanza.py Normal file
View File

@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urlencode
from .parser import Parser
class Fanza(Parser):
source = 'fanza'
expr_title = '//*[starts-with(@id, "title")]/text()'
expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
expr_cover = '//head/meta[@property="og:image"]'
expr_extrafanart = '//a[@name="sample-image"]/img/@src'
expr_outline = "//div[@class='mg-b20 lh4']/text()"
expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
expr_outline_og = '//head/meta[@property="og:description"]'
expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
def search(self, number):
self.number = number
# fanza allows letters + numbers + underscore, normalize the input here
# @note: the only underscore usage found so far is h_test123456789
fanza_search_number = number
# AV_Data_Capture.py.getNumber() over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
for url in fanza_urls:
self.detailurl = url + fanza_search_number
url = "https://www.dmm.co.jp/age_check/=/declared=yes/?"+ urlencode({"rurl": self.detailurl})
self.htmlcode = self.getHtml(url)
if self.htmlcode != 404:
self.htmltree = etree.HTML(self.htmlcode)
break
if self.htmlcode == 404:
return 404
result = self.dictformat(self.htmltree)
return result
def getNum(self, htmltree):
# for some old pages the input number does not match the page:
# for example, the url will be cid=test012
# but the hinban on the page is test00012,
# so get the hinban first and pass it to the following functions
self.fanza_hinban = self.getFanzaString('品番:')
self.number = self.fanza_hinban
number_lo = self.number.lower()
if (re.sub('-|_', '', number_lo) == self.fanza_hinban or
number_lo.replace('-', '00') == self.fanza_hinban or
number_lo.replace('-', '') + 'so' == self.fanza_hinban
):
self.number = self.number
return self.number
def getStudio(self, htmltree):
return self.getFanzaString('メーカー')
def getOutline(self, htmltree):
try:
result = self.getTreeElement(htmltree, self.expr_outline).replace("\n", "")
if result == '':
result = self.getTreeElement(htmltree, self.expr_outline2).replace("\n", "")
if "※ 配信方法によって収録内容が異なる場合があります。" == result:
result = self.getTreeElement(htmltree, self.expr_outline_og).get('content')
return result
except:
return ''
def getRuntime(self, htmltree):
return str(re.search(r'\d+', super().getRuntime(htmltree)).group()).strip(" ['']")
def getDirector(self, htmltree):
if "anime" not in self.detailurl:
return self.getFanzaString('監督:')
return ''
def getActors(self, htmltree):
if "anime" not in self.detailurl:
return super().getActors(htmltree)
return ''
def getRelease(self, htmltree):
result = self.getFanzaString('発売日:')
if result == '' or result == '----':
result = self.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getCover(self, htmltree):
return self.getTreeElement(htmltree, './/head/meta[@property="og:image"]').get('content')
def getTags(self, htmltree):
return self.getFanzaStrings('ジャンル:')
def getLabel(self, htmltree):
ret = self.getFanzaStrings('レーベル')
if ret == "----":
return ''
return ret
def getSeries(self, htmltree):
ret = self.getFanzaStrings('シリーズ:')
if ret == "----":
return ''
return ret
def getFanzaString(self, expr):
result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getFanzaStrings(self, string):
result1 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.htmltree.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2

71
scrapinglib/fc2.py Normal file

@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urljoin
from .parser import Parser
class Fc2(Parser):
source = 'fc2'
imagecut = 0
expr_title = '/html/head/title/text()'
expr_studio = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_release = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()'
expr_runtime = "//p[@class='items_article_info']/text()"
expr_director = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_actor = '//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'
expr_cover = "//div[@class='items_article_MainitemThumb']/span/img/@src"
expr_tags = "//a[@class='tag tagTag']/text()"
def search(self, number):
self.number = number.replace('FC2-', '').replace('fc2-', '')
self.detailurl = 'https://adult.contents.fc2.com/article/' + self.number + '/'
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.HTML(self.htmlcode)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return 'FC2-' + self.number
def getRelease(self, htmltree):
return super().getRelease(htmltree).strip(" ['販売日 : ']").replace('/','-')
def getActors(self, htmltree):
actors = super().getActors(htmltree)
if not actors:
actors = '素人'
return actors
def getCover(self, htmltree):
return urljoin('https://adult.contents.fc2.com', super().getCover(htmltree))
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTrailer(self, htmltree):
video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
video = video_pather.findall(self.htmlcode)
if video:
try:
video_url = video[0].replace('\'', '')
video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + self.number + '/sample?key=' + video_url
url_json = eval(self.getHtml(video_url))['path'].replace('\\', '')
return url_json
except:
return ''
else:
return ''

73
scrapinglib/gcolle.py Normal file

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .httprequest import get_html_session
from .parser import Parser
class Gcolle(Parser):
source = 'gcolle'
imagecut = 4
expr_r18 = '//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href'
expr_number = '//td[contains(text(),"商品番号")]/../td[2]/text()'
expr_title = '//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()'
expr_studio = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_director = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_actor = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_label = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_series = '//td[contains(text(),"アップロード会員名")]/b/text()'
expr_release = '//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'
expr_cover = '//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'
expr_tags = '//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'
expr_outline = '//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'
expr_extrafanart = '//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src'
expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
def search(self, number):
self.number = number.upper().replace('GCOLLE-','')
self.detailurl = 'https://gcolle.net/product_info.php/products_id/' + self.number
session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
htmltree = etree.HTML(htmlcode)
r18url = self.getTreeElement(htmltree, self.expr_r18)
if r18url and r18url.startswith('http'):
htmlcode = session.get(r18url).text
htmltree = etree.HTML(htmlcode)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
num = super().getNum(htmltree)
if self.number != num:
raise Exception(f'[!] {self.number}: find [{num}] in gcolle, not match')
return "GCOLLE-" + str(num)
def getOutline(self, htmltree):
result = self.getTreeAll(htmltree, self.expr_outline)
try:
return "\n".join(result)
except:
return ""
def getRelease(self, htmltree):
return re.findall('\d{4}-\d{2}-\d{2}', super().getRelease(htmltree))[0]
def getCover(self, htmltree):
return "https:" + super().getCover(htmltree)
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getExtrafanart(self, htmltree):
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart)
if len(extrafanart) == 0:
extrafanart = self.getTreeAll(htmltree, self.expr_extrafanart2)
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
return extrafanart

150
scrapinglib/getchu.py Normal file

@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
import re
import json
from urllib.parse import quote
from .parser import Parser
class Getchu():
source = 'getchu'
def scrape(self, number, core: None):
dl = dlGetchu()
www = wwwGetchu()
number = number.replace("-C", "")
dic = {}
if "item" in number:
sort = ["dl.scrape(number, core)", "www.scrape(number, core)"]
else:
sort = ["www.scrape(number, core)", "dl.scrape(number, core)"]
for i in sort:
try:
dic = eval(i)
if dic != None and json.loads(dic).get('title') != '':
break
except:
pass
return dic
class wwwGetchu(Parser):
imagecut = 0
allow_number_change = True
cookies = {'getchu_adalt_flag': 'getchu.com', "adult_check_flag": "1"}
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
expr_title = '//*[@id="soft-title"]/text()'
expr_cover = "/html/body/div[1]/table[2]/tr[1]/td/a/@href"
expr_director = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_studio = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_actor = "//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"
expr_label = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
expr_release = "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
expr_tags = "//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"
expr_outline = "//div[contains(text(),'商品紹介')]/following-sibling::div/text()"
expr_extrafanart = "//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"
expr_series = "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
def queryNumberUrl(self, number):
self.number = quote(number, encoding="euc_jp")
queryUrl = self.GETCHU_WWW_SEARCH_URL.replace("_WORD_", self.number)
# NOTE: not clear why two attempts are needed; retry once if the first query comes back empty
retry = 2
for i in range(retry):
queryTree = self.getHtmlTree(queryUrl)
detailurl = self.getTreeElement(queryTree, '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if detailurl:
break
if detailurl == "":
return None
return detailurl.replace('../', 'http://www.getchu.com/')
def getNum(self, htmltree):
return 'GETCHU-' + re.findall('\d+', self.detailurl.replace("http://www.getchu.com/soft.phtml?id=", ""))[0]
def getCover(self, htmltree):
return "http://www.getchu.com" + super().getCover(htmltree).replace("./", '/')
def getActors(self, htmltree):
return super().getDirector(htmltree)
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getOutline(self, htmltree):
outline = ''
_list = self.getTreeAll(htmltree, self.expr_outline)
for i in _list:
outline = outline + i.strip()
return outline
def getExtrafanart(self, htmltree):
arts = super().getExtrafanart(htmltree)
extrafanart = []
for i in arts:
i = "http://www.getchu.com" + i.replace("./", '/')
if 'jpg' in i:
extrafanart.append(i)
return extrafanart
def extradict(self, dic: dict):
""" 额外新增的 headers
"""
dic['headers'] = {'referer': self.detailurl}
return dic
class dlGetchu(wwwGetchu):
""" 二者基本一致
headers extrafanart 略有区别
"""
imagecut = 4
allow_number_change = True
cookies = {"adult_check_flag": "1"}
extraheader = {"Referer": "https://dl.getchu.com/"}
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
expr_title = "//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"
expr_cover = "//td[contains(@bgcolor,'#ffffff')]/img/@src"
expr_director = "//td[contains(text(),'作者')]/following-sibling::td/text()"
expr_studio = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
expr_label = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
expr_runtime = "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()"
expr_release = "//td[contains(text(),'配信開始日')]/following-sibling::td/text()"
expr_tags = "//td[contains(text(),'趣向')]/following-sibling::td/a/text()"
expr_outline = "//*[contains(text(),'作品内容')]/following-sibling::td/text()"
expr_extrafanart = "//td[contains(@style,'background-color: #444444;')]/a/@href"
expr_series = "//td[contains(text(),'サークル')]/following-sibling::td/a/text()"
def queryNumberUrl(self, number):
if "item" in number or 'GETCHU' in number.upper():
self.number = re.findall('\d+',number)[0]
else:
queryUrl = self.GETCHU_DL_SEARCH_URL.replace("_WORD_", number)
queryTree = self.getHtmlTree(queryUrl)
detailurl = self.getTreeElement(queryTree, '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
if detailurl == "":
return None
self.number = re.findall('\d+', detailurl)[0]
return self.GETCHU_DL_URL.replace("_WORD_", self.number)
def getNum(self, htmltree):
return 'GETCHU-' + re.findall('\d+', self.number)[0]
def getCover(self, htmltree):
return "https://dl.getchu.com" + super().getCover(htmltree)
def extradict(self, dic: dict):
return dic
def getExtrafanart(self, htmltree):
arts = self.getTreeAll(htmltree, self.expr_extrafanart)
extrafanart = []
for i in arts:
i = "https://dl.getchu.com" + i
extrafanart.append(i)
return extrafanart
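The Getchu dispatcher at the top of this file decides which source to try first and calls the two parsers via eval(); a rough eval-free equivalent, assuming the same two classes above, would look like this (illustrative sketch, not part of the PR):

import json

def scrape_getchu(number, core=None):
    dl, www = dlGetchu(), wwwGetchu()
    number = number.replace("-C", "")
    # "item"-style numbers live on dl.getchu.com, so try that source first
    order = [dl, www] if "item" in number else [www, dl]
    for parser in order:
        try:
            dic = parser.scrape(number, core)
            if dic is not None and json.loads(dic).get('title'):
                return dic
        except Exception:
            continue
    return {}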

250
scrapinglib/httprequest.py Normal file

@@ -0,0 +1,250 @@
# -*- coding: utf-8 -*-
import mechanicalsoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from cloudscraper import create_scraper
G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.133 Safari/537.36'
G_DEFAULT_TIMEOUT = 10
def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
Core HTTP GET helper.
Whether to use a proxy is decided by the caller.
"""
errors = ""
headers = {"User-Agent": ua or G_USER_AGENT}
if extra_headers != None:
headers.update(extra_headers)
for i in range(retry):
try:
result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
verify=verify, cookies=cookies)
if return_type == "object":
return result
elif return_type == "content":
return result.content
else:
result.encoding = encoding or result.apparent_encoding
return result.text
except Exception as e:
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
errors = str(e)
if "getaddrinfo failed" in errors:
print("[-]Connect Failed! Please Check your proxy config")
print("[-]" + errors)
else:
print("[-]" + errors)
print('[-]Connect Failed! Please check your Proxy or Network!')
raise Exception('Connect Failed')
def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
"""
Whether to use a proxy is decided by the caller.
"""
errors = ""
headers = {"User-Agent": ua or G_USER_AGENT}
for i in range(retry):
try:
result = requests.post(url, data=data, files=files, headers=headers, timeout=timeout, proxies=proxies,
verify=verify, cookies=cookies)
if return_type == "object":
return result
elif return_type == "content":
return result.content
else:
result.encoding = encoding or result.apparent_encoding
return result
except Exception as e:
print(f"[-]Connect: {url} retry {i + 1}/{retry}")
errors = str(e)
if "getaddrinfo failed" in errors:
print("[-]Connect Failed! Please Check your proxy config")
print("[-]" + errors)
else:
print("[-]" + errors)
print('[-]Connect Failed! Please check your Proxy or Network!')
raise Exception('Connect Failed')
#
# TODO: the helpers below are temporary; revisit once all site parsers are migrated
#
class TimeoutHTTPAdapter(HTTPAdapter):
def __init__(self, *args, **kwargs):
self.timeout = G_DEFAULT_TIMEOUT
if "timeout" in kwargs:
self.timeout = kwargs["timeout"]
del kwargs["timeout"]
super().__init__(*args, **kwargs)
def send(self, request, **kwargs):
timeout = kwargs.get("timeout")
if timeout is None:
kwargs["timeout"] = self.timeout
return super().send(request, **kwargs)
# session helper with keep-alive support
# used only by storyline, carib, gcolle and javdb
def get_html_session(url: str = None, cookies = None, ua: str = None, return_type: str = None,
encoding: str = None, retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = requests.Session()
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
session.headers = {"User-Agent": ua or G_USER_AGENT}
try:
if isinstance(url, str) and len(url):
result = session.get(str(url))
else: # empty url: just return the reusable session object; return_type does not apply
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "session":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_session() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_session() failed. {e}")
return None
# used only by storyline
# optionally uses cloudscraper
def get_html_by_browser(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, use_scraper: bool = False,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = create_scraper(browser={'custom': ua or G_USER_AGENT, }) if use_scraper else requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
if isinstance(url, str) and len(url):
result = browser.open(url)
else:
return browser
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "browser":
return result, browser
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_browser() Proxy error! Please check your Proxy")
except Exception as e:
print(f'[-]get_html_by_browser() Failed! {e}')
return None
# used only by storyline and xcity
def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None,
return_type: str = None, encoding: str = None,
retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
session = requests.Session()
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
browser = mechanicalsoup.StatefulBrowser(user_agent=ua or G_USER_AGENT, session=session)
result = browser.open(url)
if not result.ok:
return None
form = browser.select_form() if form_select is None else browser.select_form(form_select)
if isinstance(fields, dict):
for k, v in fields.items():
browser[k] = v
response = browser.submit_selected()
if return_type == "object":
return response
elif return_type == "content":
return response.content
elif return_type == "browser":
return response, browser
else:
result.encoding = encoding or "utf-8"
return response.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_form() Proxy error! Please check your Proxy")
except Exception as e:
print(f'[-]get_html_by_form() Failed! {e}')
return None
# used only by storyline and javdb
def get_html_by_scraper(url: str = None, cookies: dict = None, ua: str = None, return_type: str = None,
encoding: str = None, retry: int = 3, proxies=None, timeout: int = G_DEFAULT_TIMEOUT, verify=None):
session = create_scraper(browser={'custom': ua or G_USER_AGENT, })
if isinstance(cookies, dict) and len(cookies):
requests.utils.add_dict_to_cookiejar(session.cookies, cookies)
retries = Retry(total=retry, connect=retry, backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
session.mount("http://", TimeoutHTTPAdapter(max_retries=retries, timeout=timeout))
if verify:
session.verify = verify
if proxies:
session.proxies = proxies
try:
if isinstance(url, str) and len(url):
result = session.get(str(url))
else: # empty url: just return the reusable scraper object; return_type does not apply
return session
if not result.ok:
return None
if return_type == "object":
return result
elif return_type == "content":
return result.content
elif return_type == "scraper":
return result, session
else:
result.encoding = encoding or "utf-8"
return result.text
except requests.exceptions.ProxyError:
print("[-]get_html_by_scraper() Proxy error! Please check your Proxy")
except Exception as e:
print(f"[-]get_html_by_scraper() failed. {e}")
return None
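A hedged usage sketch for the helpers above (the URL is a placeholder and the import path assumes the scrapinglib package layout of this PR):

from scrapinglib import httprequest

text = httprequest.get('https://example.com', retry=2)                # decoded body text
resp = httprequest.get('https://example.com', return_type='object')   # raw requests.Response
session = httprequest.get_html_session(cookies={'over18': '1'})       # reusable keep-alive session, no URL given
if session is not None:
    page = session.get('https://example.com').text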

83
scrapinglib/jav321.py Normal file

@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from . import httprequest
from .parser import Parser
class Jav321(Parser):
source = 'jav321'
expr_title = "/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()"
expr_cover = "/html/body/div[2]/div[2]/div[1]/p/a/img/@src"
expr_outline = "/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()"
# NOTE: use xpath uniformly for the fields below
expr_number = '//b[contains(text(),"品番")]/following-sibling::node()'
expr_actor = '//b[contains(text(),"出演者")]/following-sibling::a[starts-with(@href,"/star")]'
expr_label = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
expr_tags = '//b[contains(text(),"ジャンル")]/following-sibling::a[starts-with(@href,"/genre")]'
expr_studio = '//b[contains(text(),"メーカー")]/following-sibling::a[starts-with(@href,"/company")]'
expr_release = '//b[contains(text(),"配信開始日")]/following-sibling::node()'
expr_runtime = '//b[contains(text(),"収録時間")]/following-sibling::node()'
# expr_series = '//b[contains(text(),"シリーズ")]'
def queryNumberUrl(self, number):
return 'https://www.jav321.com/search'
def getHtmlTree(self, url):
resp = httprequest.post(url, data={"sn": self.number}, cookies=self.cookies, proxies=self.proxies, verify=self.verify)
if "/video/" in resp.url:
self.detailurl = resp.url
self.detailhtml = resp.text
return etree.fromstring(resp.text, etree.HTMLParser())
return None
def getNum(self, htmltree):
return super().getNum(htmltree).split(": ")[1]
def getTrailer(self, htmltree):
videourl_pather = re.compile(r'<source src=\"(.*?)\"')
videourl = videourl_pather.findall(self.detailhtml)
if videourl:
url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
return url
else:
return ''
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
html = html_pather.search(self.detailhtml)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getRelease(self, htmltree):
return super().getRelease(htmltree).split(": ")[1]
def getRuntime(self, htmltree):
return super().getRuntime(htmltree).split(": ")[1]
def parseElement(self, all):
if all:
ret = []
for si in all:
ret.append(si.text)
return ",".join(ret)
return ''
def getActors(self, htmltree):
return self.parseElement(super().getActors(htmltree))
def getLabel(self, htmltree):
return self.parseElement(self.getTreeAll(htmltree, self.expr_label))
def getTags(self, htmltree):
return self.parseElement(self.getTreeAll(htmltree, self.expr_tags))
def getStudio(self, htmltree):
return self.parseElement(self.getTreeAll(htmltree, self.expr_studio))

145
scrapinglib/javbus.py Normal file

@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
import re
import os
import secrets
import inspect
from lxml import etree
from urllib.parse import urljoin
from .parser import Parser
class Javbus(Parser):
source = 'javbus'
expr_number = '/html/head/meta[@name="keywords"]/@content'
expr_title = '/html/head/title/text()'
expr_studio = '//span[contains(text(),"製作商:")]/../a/text()'
expr_studio2 = '//span[contains(text(),"メーカー:")]/../a/text()'
expr_director = '//span[contains(text(),"導演:")]/../a/text()'
expr_directorJa = '//span[contains(text(),"監督:")]/../a/text()'
expr_series = '//span[contains(text(),"系列:")]/../a/text()'
expr_series2 = '//span[contains(text(),"シリーズ:")]/../a/text()'
expr_label = '//span[contains(text(),"系列:")]/../a/text()'
expr_cover = '//a[@class="bigImage"]/@href'
expr_release = '/html/body/div[5]/div[1]/div[2]/p[2]/text()'
expr_runtime = '/html/body/div[5]/div[1]/div[2]/p[3]/text()'
expr_actor = '//div[@class="star-name"]/a'
expr_actorphoto = '//div[@class="star-name"]/../a/img'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
expr_uncensored = '//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]'
def search(self, number):
self.number = number
try:
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
'dmmbus.fun', 'dmmsee.fun',
'fanbus.us',
'seedmm.fun',
]) + "/"
try:
self.detailurl = url + number
self.htmlcode = self.getHtml(self.detailurl)
except:
self.detailurl = 'https://www.javbus.com/' + number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode,etree.HTMLParser())
result = self.dictformat(htmltree)
return result
except:
return self.searchUncensored(number)
def searchUncensored(self, number):
""" 二次搜索无码
"""
self.imagecut = 0
self.uncensored = True
w_number = number.replace('.', '-')
self.detailurl = 'https://www.javbus.red/' + w_number
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
return super().getNum(htmltree).split(',')[0]
def getTitle(self, htmltree):
title = super().getTitle(htmltree)
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudio(self, htmltree):
if self.uncensored:
return self.getTreeElement(htmltree, self.expr_studio2)
else:
return self.getTreeElement(htmltree, self.expr_studio)
def getCover(self, htmltree):
return urljoin("https://www.javbus.com", super().getCover(htmltree))
def getRelease(self, htmltree):
return super().getRelease(htmltree).strip(" ['']")
def getRuntime(self, htmltree):
return super().getRuntime(htmltree).strip(" ['']分鐘")
def getActors(self, htmltree):
actors = super().getActors(htmltree)
b=[]
for i in actors:
b.append(i.attrib['title'])
return b
def getActorPhoto(self, htmltree):
actors = super().getActorPhoto(htmltree)
d = {}
for i in actors:
p = i.attrib['src']
if "nowprinting.gif" in p:
continue
t = i.attrib['title']
d[t] = urljoin("https://www.javbus.com", p)
return d
def getDirector(self, htmltree):
if self.uncensored:
return self.getTreeElement(htmltree, self.expr_directorJa)
else:
return self.getTreeElement(htmltree, self.expr_director)
def getSeries(self, htmltree):
if self.uncensored:
return self.getTreeElement(htmltree, self.expr_series2)
else:
return self.getTreeElement(htmltree, self.expr_series)
def getTags(self, htmltree):
tags = super().getTags(htmltree).split(',')
return tags[1:]
def getExtrafanart(self, htmltree):
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(self.htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getOutline(self, htmltree):
if self.morestoryline:
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
return '' # calls coming from airav.py do not need the outline; return early to avoid fetching the same data twice and slowing things down
from .storyline import getStoryline
return getStoryline(self.number , uncensored = self.uncensored)
return ''

260
scrapinglib/javdb.py Normal file

@@ -0,0 +1,260 @@
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_session
from .parser import Parser
class Javdb(Parser):
source = 'javdb'
fixstudio = False
noauth = False
expr_number = '//strong[contains(text(),"番號")]/../span/text()'
expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
expr_title = "/html/head/title/text()"
expr_title_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/text()'
expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
expr_uncensored = '//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?") or contains(@href,"/tags/western?")]'
expr_actor = '//span[@class="value"]/a[contains(@href,"/actors/")]/text()'
expr_actor2 = '//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class'
expr_release = '//strong[contains(text(),"日期")]/../span/text()'
expr_release_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "meta")]/text()'
expr_studio = '//strong[contains(text(),"片商")]/../span/a/text()'
expr_studio2 = '//strong[contains(text(),"賣家:")]/../span/a/text()'
expr_director = '//strong[contains(text(),"導演")]/../span/text()'
expr_director2 = '//strong[contains(text(),"導演")]/../span/a/text()'
expr_cover = "//div[contains(@class, 'column-video-cover')]/a/img/@src"
expr_cover2 = "//div[contains(@class, 'column-video-cover')]/img/@src"
expr_cover_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "cover")]/img/@src'
expr_extrafanart = "//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href"
expr_tags = '//strong[contains(text(),"類別")]/../span/a/text()'
expr_tags2 = '//strong[contains(text(),"類別")]/../span/text()'
expr_series = '//strong[contains(text(),"系列")]/../span/text()'
expr_series2 = '//strong[contains(text(),"系列")]/../span/a/text()'
expr_label = '//strong[contains(text(),"系列")]/../span/text()'
expr_label2 = '//strong[contains(text(),"系列")]/../span/a/text()'
expr_userrating = '//span[@class="score-stars"]/../text()'
expr_uservotes = '//span[@class="score-stars"]/../text()'
expr_actorphoto = '//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]'
def updateCore(self, core):
if core.proxies:
self.proxies = core.proxies
if core.verify:
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
# special
if core.dbcookies:
self.cookies = core.dbcookies
else:
self.cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
if core.dbsite:
self.dbsite = core.dbsite
else:
self.dbsite = 'javdb'
def search(self, number: str):
self.number = number
self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
self.detailurl = self.queryNumberUrl(number)
self.deatilpage = self.session.get(self.detailurl).text
if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
self.noauth = True
self.imagecut = 0
result = self.dictformat(self.querytree)
else:
htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
result = self.dictformat(htmltree)
return result
def queryNumberUrl(self, number):
javdb_url = 'https://' + self.dbsite + '.com/search?q=' + number + '&f=all'
try:
resp = self.session.get(javdb_url)
except Exception as e:
print(e)
raise Exception(f'[!] {self.number}: page not found in javdb')
self.querytree = etree.fromstring(resp.text, etree.HTMLParser())
# javdb sometimes returns multiple results,
# and the first element may not be the one we are looking for;
# iterate over all candidates and find the matching one
urls = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
# for western numbers the ids column looks like ['Blacked','Blacked'], so just take the first result
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
correct_url = urls[0]
else:
ids = self.getTreeAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
try:
self.queryid = ids.index(number)
correct_url = urls[self.queryid]
except:
# to avoid picking up a wrong number, only accept an exact match
if ids[0].upper() != number:
raise ValueError("number not found in javdb")
correct_url = urls[0]
return urljoin(resp.url, correct_url)
def getNum(self, htmltree):
if self.noauth:
return self.number
result1 = str(self.getTreeAll(htmltree, self.expr_number)).strip(" ['']")
result2 = str(self.getTreeAll(htmltree, self.expr_number2)).strip(" ['']")
dp_number = str(result2 + result1).strip('+')
# NOTE: verify the match and update self.number
if dp_number.upper() != self.number.upper():
raise Exception(f'[!] {self.number}: find [{dp_number}] in javdb, not match')
self.number = dp_number
return self.number
def getTitle(self, htmltree):
if self.noauth:
return self.getTreeElement(htmltree, self.expr_title_no, self.queryid)
browser_title = super().getTitle(htmltree)
title = browser_title[:browser_title.find(' | JavDB')].strip()
return title.replace(self.number, '').strip()
def getCover(self, htmltree):
if self.noauth:
return self.getTreeElement(htmltree, self.expr_cover_no, self.queryid)
return super().getCover(htmltree)
def getRelease(self, htmltree):
if self.noauth:
return self.getTreeElement(htmltree, self.expr_release_no, self.queryid).strip()
return super().getRelease(htmltree)
def getRuntime(self, htmltree):
result1 = str(self.getTreeAll(htmltree, self.expr_runtime)).strip(" ['']")
result2 = str(self.getTreeAll(htmltree, self.expr_runtime2)).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getDirector(self, htmltree):
result1 = str(self.getTreeAll(htmltree, self.expr_director)).strip(" ['']")
result2 = str(self.getTreeAll(htmltree, self.expr_director2)).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getSeries(self, htmltree):
result1 = str(self.getTreeAll(htmltree, self.expr_series)).strip(" ['']")
result2 = str(self.getTreeAll(htmltree, self.expr_series2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio:
result = self.getStudio(htmltree)
return result
def getLabel(self, htmltree):
result1 = str(self.getTreeAll(htmltree, self.expr_label)).strip(" ['']")
result2 = str(self.getTreeAll(htmltree, self.expr_label2)).strip(" ['']")
result = str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
if not result and self.fixstudio:
result = self.getStudio(htmltree)
return result
def getActors(self, htmltree):
actors = self.getTreeAll(htmltree, self.expr_actor)
genders = self.getTreeAll(htmltree, self.expr_actor2)
r = []
idx = 0
# NOTE: keep only female performers; others are ignored
actor_gender = 'female'
for act in actors:
if((actor_gender == 'all')
or (actor_gender == 'both' and genders[idx] in ['symbol female', 'symbol male'])
or (actor_gender == 'female' and genders[idx] == 'symbol female')
or (actor_gender == 'male' and genders[idx] == 'symbol male')):
r.append(act)
idx = idx + 1
if re.match(r'FC2-[\d]+', self.number, re.A) and not r:
r = '素人'
self.fixstudio = True
return r
def getOutline(self, htmltree):
if self.morestoryline:
from .storyline import getStoryline
return getStoryline(self.number, self.getUncensored(htmltree))
return ''
def getStudio(self, htmltree):
try:
return str(self.getTreeAll(htmltree, self.expr_studio)).strip(" ['']")
except:
pass
try:
return str(self.getTreeAll(htmltree, self.expr_studio2)).strip(" ['']")
except:
return ''
def getTrailer(self, htmltree):
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
video = video_pather.findall(self.deatilpage)
# also guard against an empty match list
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
video_url = video[0]
else:
video_url = ''
return video_url
def getTags(self, htmltree):
try:
return self.getTreeAll(htmltree, self.expr_tags)
except:
pass
try:
return self.getTreeAll(htmltree, self.expr_tags2)
except:
return ''
def getUserRating(self, htmltree):
try:
result = str(self.getTreeElement(htmltree, self.expr_userrating))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0])
except:
return
def getUserVotes(self, htmltree):
try:
result = str(self.getTreeElement(htmltree, self.expr_uservotes))
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return int(v[0][1])
except:
return
def getaphoto(self, url, session):
html_page = session.get(url).text
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
return img_url[0] if img_url else ''
def getActorPhoto(self, htmltree):
actorall = self.getTreeAll(htmltree, self.expr_actorphoto)
if not actorall:
return {}
actors = self.getActors(htmltree)
actor_photo = {}
for i in actorall:
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
if not len(x) or not len(x[0]) or i.text not in actors:
continue
# NOTE: https://c1.jdbstatic.com changes frequently, so take the avatar URL from the actor page instead
# actor_id = x[0]
# pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
# if not self.session.head(pic_url).ok:
try:
pic_url = self.getaphoto(urljoin('https://javdb.com', i.attrib['href']), self.session)
if len(pic_url):
actor_photo[i.text] = pic_url
except:
pass
return actor_photo
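For illustration, a standalone sketch of the candidate-matching step in queryNumberUrl(); the ids/urls lists below are made up, not real javdb results:

ids = ['ABC-122', 'ABC-123', 'ABC-124']
urls = ['/v/aaa', '/v/bbb', '/v/ccc']
number = 'ABC-123'
try:
    correct_url = urls[ids.index(number)]
except ValueError:
    # only accept an exact match to avoid picking up a wrong number
    if ids[0].upper() != number.upper():
        raise ValueError("number not found in javdb")
    correct_url = urls[0]
print(correct_url)  # /v/bbb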

63
scrapinglib/madou.py Normal file

@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from urllib.parse import urlparse, unquote
from .parser import Parser
class Madou(Parser):
source = 'madou'
uncensored = True
expr_url = '//a[@class="share-weixin"]/@data-url'
expr_title = "/html/head/title/text()"
expr_studio = '//a[@rel="category tag"]/text()'
expr_tags = '/html/head/meta[@name="keywords"]/@content'
def search(self, number):
self.number = number.lower().strip()
self.detailurl = "https://madou.club/" + number + ".html"
self.htmlcode = self.getHtml(self.detailurl)
if self.htmlcode == 404:
return 404
htmltree = etree.fromstring(self.htmlcode, etree.HTMLParser())
self.detailurl = self.getTreeElement(htmltree, self.expr_url)
result = self.dictformat(htmltree)
return result
def getNum(self, htmltree):
try:
# decode the url
filename = unquote(urlparse(self.detailurl).path)
# trim the filename: drop the leading '/' and the trailing '.html'
result = filename[1:-5].upper().strip()
# drop any Chinese part
if result.upper() != self.number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
# strip leftover separators
return result.strip('-')
except:
return ''
def getTitle(self, htmltree):
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(super().getTitle(htmltree))
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getCover(self, htmltree):
try:
url = str(re.findall("shareimage : '(.*?)'", self.htmlcode)[0])
return url.strip()
except:
return ''
def getTags(self, htmltree):
studio = self.getStudio(htmltree)
x = super().getTags(htmltree).split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
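For illustration, the slug-to-number extraction in getNum() boiled down to a standalone snippet (the URL is an example in the same style as the titles quoted above):

from urllib.parse import urlparse, unquote

detailurl = 'https://madou.club/md0140-2.html'
filename = unquote(urlparse(detailurl).path)   # '/md0140-2.html'
number = filename[1:-5].upper().strip()        # drop the leading '/' and trailing '.html'
print(number)  # MD0140-2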

52
scrapinglib/mgstage.py Normal file

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from .utils import getTreeElement
from .parser import Parser
class Mgstage(Parser):
source = 'mgstage'
expr_number = '//th[contains(text(),"品番:")]/../td/a/text()'
expr_title = '//*[@id="center_column"]/div[1]/h1/text()'
expr_studio = '//th[contains(text(),"メーカー:")]/../td/a/text()'
expr_outline = '//dl[@id="introduction"]/dd/p/text()'
expr_runtime = '//th[contains(text(),"収録時間:")]/../td/a/text()'
expr_director = '//th[contains(text(),"シリーズ")]/../td/a/text()'
expr_actor = '//th[contains(text(),"出演:")]/../td/a/text()'
expr_release = '//th[contains(text(),"配信開始日:")]/../td/a/text()'
expr_cover = '//*[@id="EnlargeImage"]/@href'
expr_label = '//th[contains(text(),"シリーズ:")]/../td/a/text()'
expr_tags = '//th[contains(text(),"ジャンル:")]/../td/a/text()'
expr_tags2 = '//th[contains(text(),"ジャンル:")]/../td/text()'
expr_series = '//th[contains(text(),"シリーズ")]/../td/a/text()'
expr_extrafanart = '//a[@class="sample_image"]/@href'
def search(self, number):
self.number = number.upper()
self.cookies = {'adc':'1'}
self.detailurl = 'https://www.mgstage.com/product/product_detail/'+str(self.number)+'/'
htmltree =self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
return result
def getTitle(self, htmltree):
return super().getTitle(htmltree).replace('/', ',').strip()
def getTags(self, htmltree):
results = self.getTreeAll(htmltree, self.expr_tags)
results2 = self.getTreeAll(htmltree, self.expr_tags2)
return [ x.strip() for x in (results + results2) if x.strip()]
def getTreeAll(self, tree, expr):
alls = super().getTreeAll(tree, expr)
return [ x.strip() for x in alls ]
def getTreeElement(self, tree, expr, index=0):
if expr == '':
return ''
result1 = getTreeElement(tree, expr).strip().replace("', '", '').strip(" ['']")
result2 = getTreeElement(tree, expr.replace('td/a/','td/')).strip().replace("', '", '').strip(" ['']")
if result1 == result2:
return str(result1).strip('+').replace("', '",'').replace('"','')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
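For illustration, why the getTreeElement() override above queries both the 'td/a' and plain 'td' forms: some mgstage fields are rendered as links and others as bare text. The HTML below is made up:

from lxml import etree

tree = etree.HTML(
    '<table>'
    '<tr><th>メーカー:</th><td><a href="#">SampleLabel</a></td></tr>'
    '<tr><th>配信開始日:</th><td>2022/06/14</td></tr>'
    '</table>'
)
linked = tree.xpath('//th[contains(text(),"メーカー")]/../td/a/text()')    # ['SampleLabel']
plain = tree.xpath('//th[contains(text(),"配信開始日")]/../td/text()')     # ['2022/06/14']
print(linked, plain)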

93
scrapinglib/mv91.py Normal file

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
import re
from lxml import etree
from .parser import Parser
class Mv91(Parser):
source = 'mv91'
expr_number = '//div[@class="player-title"]/text()'
expr_title = '//div[@class="player-title"]/text()'
expr_release = '//p[@class="date"]/text()'
expr_outline = '//div[@class="play-text"]/text()'
expr_tags = '//div[@class="player-tag"]/text()'
expr_actor = '//p[@class="player-name"]/text()'
def getHtmlTree(self, url, type=None):
self.htmlcode = self.getHtml(url, type)
if self.htmlcode == 404:
return 404
ret = etree.fromstring(self.htmlcode, etree.HTMLParser())
return ret
def queryNumberUrl(self, number):
keyword = number.replace('91CM-','').replace('91MS-','')
search_html = self.getHtml('https://www.91mv.org/index/search?keywords=' + keyword)
html = etree.fromstring(search_html, etree.HTMLParser())
endurl = html.xpath('//a[@class="video-list"]/@href')[0]
return 'https://www.91mv.org' + endurl
def getNum(self, htmltree):
try:
num = super().getNum(htmltree)
finds = re.findall('(.*)(91.*-\d*)',num)
if finds:
result = str(finds[0][1])
else:
result = ' '.join(num.replace('/',' ').split())
result = result.split()[1]
if self.number.upper() != result.upper():
raise Exception(f'[!] {self.number}: find {result} in mv91, not match')
return result.strip()
except:
return ''
def getTitle(self, htmltree):
try:
title = super().getTitle(htmltree)
finds = re.findall('(.*)(91.*-\d*)',title)
if finds:
result = str(finds[0][0])
else:
result = ' '.join(title.replace('/',' ').split())
result = result.split()[0].replace('「预告」','')
return result.strip()
except:
return ''
def getStudio(self, htmltree):
return '91制片厂'
def getTags(self, htmltree):
return self.getTreeAll(htmltree, self.expr_tags)
def getActors(self, htmltree):
b=[]
for player in self.getTreeAll(htmltree, self.expr_actor):
player = player.replace('主演:','')
if '/' in player:
player = player.split('/')[0]
player = re.sub(r'[0-9]+', '', player)
b.append(player)
return b
def getRelease(self, htmltree):
try:
result = super().getRelease(htmltree)
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
pass
return ''
def getCover(self, htmltree):
try:
url = str(re.findall('var pic_url = "(.*?)"', self.htmlcode)[0])
return url.strip()
except:
return ''

265
scrapinglib/parser.py Normal file

@@ -0,0 +1,265 @@
# -*- coding: utf-8 -*-
import json
import re
from lxml import etree, html
from . import httprequest
from .utils import getTreeElement, getTreeAll
class Parser:
source = 'base'
# poster: `0` copy the cover as-is, `1` crop it
imagecut = 1
uncensored = False
allow_number_change = False
# update
proxies = None
verify = None
extraheader = None
cookies = None
morestoryline = False
number = ''
detailurl = ''
# xpath expr
expr_number = ''
expr_title = ''
expr_studio = ''
expr_studio2 = ''
expr_runtime = ''
expr_runtime2 = ''
expr_release = ''
expr_outline = ''
expr_director = ''
expr_actor = ''
expr_tags = ''
expr_label = ''
expr_label2 = ''
expr_series = ''
expr_series2 = ''
expr_cover = ''
expr_cover2 = ''
expr_smallcover = ''
expr_extrafanart = ''
expr_trailer = ''
expr_actorphoto = ''
expr_uncensored = ''
expr_userrating = ''
expr_uservotes = ''
def __init__(self) -> None:
pass
def scrape(self, number, core: None):
""" 刮削番号
"""
self.updateCore(core)
result = self.search(number)
return result
def search(self, number):
self.number = number
self.detailurl = self.queryNumberUrl(number)
htmltree = self.getHtmlTree(self.detailurl)
result = self.dictformat(htmltree)
return result
def updateCore(self, core):
""" 从`core`内更新参数
针对需要传递的参数: cookies, proxy等
子类继承后修改
"""
if core.proxies:
self.proxies = core.proxies
if core.verify:
self.verify = core.verify
if core.morestoryline:
self.morestoryline = True
def queryNumberUrl(self, number):
""" 根据番号查询详细信息url
备份查询页面,预览图可能需要
"""
url = httprequest.get(number)
return url
def getHtml(self, url, type = None):
""" 访问网页
"""
resp = httprequest.get(url, cookies=self.cookies, proxies=self.proxies, extra_headers=self.extraheader, verify=self.verify, return_type=type)
if '<title>404 Page Not Found' in resp \
or '<title>未找到页面' in resp \
or '404 Not Found' in resp \
or '<title>404' in resp \
or '<title>お探しの商品が見つかりません' in resp:
return 404
return resp
def getHtmlTree(self, url, type = None):
""" 访问网页,返回`etree`
"""
resp = self.getHtml(url, type)
if resp == 404:
return 404
ret = etree.fromstring(resp, etree.HTMLParser())
return ret
def dictformat(self, htmltree):
try:
dic = {
'number': self.getNum(htmltree),
'title': self.getTitle(htmltree),
'studio': self.getStudio(htmltree),
'year': self.getYear(htmltree),
'outline': self.getOutline(htmltree),
'runtime': self.getRuntime(htmltree),
'director': self.getDirector(htmltree),
'actor': self.getActors(htmltree),
'release': self.getRelease(htmltree),
'cover': self.getCover(htmltree),
'cover_small': self.getSmallCover(htmltree),
'extrafanart': self.getExtrafanart(htmltree),
'trailer': self.getTrailer(htmltree),
'imagecut': self.imagecut,
'tag': self.getTags(htmltree),
'label': self.getLabel(htmltree),
'actor_photo': self.getActorPhoto(htmltree),
'website': self.detailurl,
'source': self.source,
'series': self.getSeries(htmltree),
'uncensored': self.getUncensored(htmltree),
'userrating': self.getUserRating(htmltree),
'uservotes': self.getUserVotes(htmltree)
}
dic = self.extradict(dic)
except Exception as e:
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
def extradict(self, dic:dict):
""" 额外修改dict
"""
return dic
def getNum(self, htmltree):
""" 增加 strip 过滤
"""
return self.getTreeElement(htmltree, self.expr_number)
def getTitle(self, htmltree):
return self.getTreeElement(htmltree, self.expr_title).strip()
def getStudio(self, htmltree):
try:
return self.getTreeElement(htmltree, self.expr_studio).strip(" ['']")
except:
pass
try:
return self.getTreeElement(htmltree, self.expr_studio2).strip(" ['']")
except:
return ''
def getYear(self, htmltree):
""" year基本都是从release中解析的
"""
try:
release = self.getRelease(htmltree)
return str(re.findall(r'\d{4}', release)).strip(" ['']")
except:
return ''
def getRuntime(self, htmltree):
try:
return self.getTreeElement(htmltree, self.expr_runtime).strip("\n\t ['']").rstrip('mi')
except:
pass
try:
return self.getTreeElement(htmltree, self.expr_runtime2).strip("\n\t ['']").rstrip('mi')
except:
return ''
def getRelease(self, htmltree):
return self.getTreeElement(htmltree, self.expr_release).strip().replace('/','-')
def getOutline(self, htmltree):
return self.getTreeElement(htmltree, self.expr_outline).strip().replace("\n","")
def getDirector(self, htmltree):
return self.getTreeElement(htmltree, self.expr_director)
def getActors(self, htmltree):
return self.getTreeAll(htmltree, self.expr_actor)
def getTags(self, htmltree):
return self.getTreeElement(htmltree, self.expr_tags)
def getLabel(self, htmltree):
try:
return self.getTreeElement(htmltree, self.expr_label).strip(" ['']")
except:
pass
try:
return self.getTreeElement(htmltree, self.expr_label2).strip(" ['']")
except:
return ''
def getSeries(self, htmltree):
try:
return self.getTreeElement(htmltree, self.expr_series).strip(" ['']")
except:
pass
try:
return self.getTreeElement(htmltree, self.expr_series2).strip(" ['']")
except:
return ''
def getCover(self, htmltree):
try:
return self.getTreeElement(htmltree, self.expr_cover).strip(" ['']")
except:
pass
try:
return self.getTreeElement(htmltree, self.expr_cover2).strip(" ['']")
except:
return ''
def getSmallCover(self, htmltree):
return self.getTreeElement(htmltree, self.expr_smallcover)
def getExtrafanart(self, htmltree):
return self.getTreeAll(htmltree, self.expr_extrafanart)
def getTrailer(self, htmltree):
return self.getTreeElement(htmltree, self.expr_trailer)
def getActorPhoto(self, htmltree):
return self.getTreeAll(htmltree, self.expr_actorphoto)
def getUncensored(self, htmltree):
if self.expr_uncensored:
u = self.getTreeAll(htmltree, self.expr_uncensored)
return bool(u)
else:
return self.uncensored
def getUserRating(self, htmltree):
return self.getTreeAll(htmltree, self.expr_userrating)
def getUserVotes(self, htmltree):
return self.getTreeAll(htmltree, self.expr_uservotes)
def getTreeElement(self, tree: html.HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
"""
return getTreeElement(tree, expr, index)
def getTreeAll(self, tree: html.HtmlElement, expr):
""" 根据表达式从`xmltree`中获取全部匹配值
"""
return getTreeAll(tree, expr)
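The base class above centralizes search(), dictformat() and the XPath helpers; a new site only has to set expr_* fields and override the few getters that need special handling. A minimal hypothetical subclass, purely illustrative (site, URL and expressions are made up):

from scrapinglib.parser import Parser

class ExampleSite(Parser):
    source = 'examplesite'
    expr_title = '//h1[@class="title"]/text()'
    expr_cover = '//img[@id="cover"]/@src'
    expr_release = '//span[@class="date"]/text()'

    def queryNumberUrl(self, number):
        # this hypothetical site maps the number straight to a detail URL
        return f'https://example.com/movie/{number}'

    def getNum(self, htmltree):
        # the page does not print the number, so echo the query back
        return self.number

Calling ExampleSite().scrape(number, core) then runs updateCore(), search() and dictformat() exactly as in the sites above.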


@@ -1,16 +1,29 @@
import sys
sys.path.append('../')
# -*- coding: utf-8 -*-
"""
This part has not been reworked yet.
"""
import os
import re
import time
import secrets
import builtins
from ADC_function import *
from urllib.parse import urljoin
from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
from number_parser import is_uncensored
from .httprequest import get_html_by_browser, get_html_by_form, get_html_by_scraper, get_html_session
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
# drop the Amazon source
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "58avgo"}
G_mode_txt = ('顺序执行','线程池')
def is_japanese(raw: str) -> bool:
"""
Simple check for Japanese text.
"""
return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', raw, re.UNICODE))
class noThread(object):
def map(self, fn, param):
@@ -22,18 +35,14 @@ class noThread(object):
# fetch the storyline from the listed sites concurrently; earlier entries take priority
def getStoryline(number, title, sites: list=None, 无码=None):
def getStoryline(number, title = None, sites: list=None, uncensored=None):
start_time = time.time()
conf = config.getInstance()
if not conf.is_storyline():
return ''
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',')
debug = False
storyine_sites = "1:avno1,4:airavwiki".split(',')
if uncensored:
storyine_sites += "3:58avgo".split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
storyine_sites += "2:airav,5:xcity".split(',')
r_dup = set()
sort_sites = []
for s in storyine_sites:
@@ -47,18 +56,11 @@ def getStoryline(number, title, sites: list=None, 无码=None):
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = 1 if conf.storyline_mode() > 0 else 0
run_mode = 1
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
for value in results:
if isinstance(value, str) and len(value):
if not is_japanese(value):
return value
if not len(sel):
sel = value
return sel
# the debug output below is written to the log
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
@@ -72,7 +74,7 @@ def getStoryline(number, title, sites: list=None, 无码=None):
for site, desc in zip(apply_sites, results):
sl = len(desc) if isinstance(desc, str) else 0
s += f'[选中{site}字数:{sl}]' if site == sel_site else f'{site}字数:{sl}' if sl else f'{site}:空'
# print(s)
print(s)
return sel
@@ -91,8 +93,8 @@ def getStoryline_mp(args):
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
# elif site == "amazon":
# storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
@@ -287,126 +289,3 @@ def getStoryline_xcity(number, debug): #获取剧情介绍 从xcity取得
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
pass
return ''
def getStoryline_amazon(q_title, number, debug):
if not isinstance(q_title, str) or not len(q_title):
return None
try:
cookie, cookies_filepath = load_cookies('amazon.json')
url = "https://www.amazon.co.jp/s?k=" + q_title
res, session = get_html_session(url, cookies=cookie, return_type='session')
if not res:
raise ValueError("get_html_session() failed")
lx = fromstring(res.text)
lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
if len(lks) and lks[0].startswith('/'):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
if not isinstance(idx, int) or idx < 0:
raise ValueError("title and number not found")
furl = urljoin(res.url, urls[idx])
res = session.get(furl)
if not res.ok:
raise ValueError("browser.open_relative()) failed.")
lx = fromstring(res.text)
lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
if len(lks) and lks[0].startswith('/'):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1) + ' '.join(p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None:
# delete invalid cookies, whether user-created or auto-created, to avoid repeated failures
cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
# auto-created cookies go to the end of the search path list (lowest priority); users with an amazon.co.jp account can export cookies from the browser into an earlier search path
ama_save = Path.home() / ".local/share/mdc/amazon.json"
ama_save.parent.mkdir(parents=True, exist_ok=True)
ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
return ama_t
except Exception as e:
if debug:
print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
pass
return None
# pick the DVD/Blu-ray listing whose title is most similar to the queried one
def amazon_select_one(a_titles, q_title, number, debug):
sel = -1
ratio = 0
que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
for tloc, title in enumerate(a_titles):
if re.search(number, title, re.I): # 基本不带番号,但也有极个别有的,找到番号相同的直接通过
return tloc
if not re.search('DVD|Blu-ray', title, re.I):
continue
ama_t = str(re.sub('DVD|Blu-ray', "", title, re.I))
ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
findlen = 0
lastpos = -1
for cloc, char in reversed(tuple(enumerate(ama_t))):
pos = que_t.rfind(char)
if lastpos >= 0:
pos_near = que_t[:lastpos].rfind(char)
if pos_near < 0:
findlen = 0
lastpos = -1
ama_t = ama_t[:cloc+1]
else:
pos = pos_near
if pos < 0:
if category(char) == 'Nd':
return -1
if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
return -1
ama_t = ama_t[:cloc]
findlen = 0
lastpos = -1
continue
if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
findlen += 1
lastpos = pos
if findlen >= 4:
break
continue
findlen = 1
lastpos = pos
if findlen==0:
return -1
r = SequenceMatcher(None, ama_t, que_t).ratio()
if r > ratio:
sel = tloc
ratio = r
save_t_ = ama_t
if ratio > 0.999:
break
if ratio < 0.5:
return -1
if not debug:
# 目前采信相似度高于0.9的结果
return sel if ratio >= 0.9 else -1
# debug 模式下记录识别准确率日志
if ratio < 0.9:
# Rejected results with similarity in [0.5, 0.9) are logged separately
with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
# Log of trusted results
with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel
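# Illustration (not from the original diff) of the two standard-library pieces
# the matcher above relies on: unicodedata.category() drops punctuation (P*),
# symbol (S*) and separator (Z*) characters, and difflib.SequenceMatcher scores
# what is left. The sample strings are made up.
def _demo_title_similarity():
    import re
    from difflib import SequenceMatcher
    from unicodedata import category

    def strip_psz(s):
        # keep only characters whose Unicode category is not P*, S* or Z*
        return ''.join(c for c in s if not re.match(r'(P|S|Z).*', category(c), re.A))

    a = strip_psz('サンプル・タイトル（通常版） DVD')
    b = strip_psz('サンプル タイトル')
    # ratio() is in [0, 1]; amazon_select_one only trusts matches >= 0.9 outside debug mode
    return SequenceMatcher(None, a, b).ratio()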

40
scrapinglib/tmdb.py Normal file
View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
from .parser import Parser
class Tmdb(Parser):
"""
Two implementations: with an apikey and without one.
apikey
"""
source = 'tmdb'
imagecut = 0
apikey = None
expr_title = '//head/meta[@property="og:title"]'
expr_release = '//div/span[@class="release"]/text()'
expr_cover = '//head/meta[@property="og:image"]'
expr_outline = '//head/meta[@property="og:description"]'
# def search(self, number):
# self.detailurl = self.queryNumberUrl(number)
# detailpage = self.getHtml(self.detailurl)
def queryNumberUrl(self, number):
"""
TODO distinguish between ID and name
"""
movie_id = number
movieUrl = "https://www.themoviedb.org/movie/" + movie_id + "?language=zh-CN"
return movieUrl
def getTitle(self, htmltree):
return self.getTreeElement(htmltree, self.expr_title).get('content')
def getCover(self, htmltree):
return "https://www.themoviedb.org" + self.getTreeElement(htmltree, self.expr_cover).get('content')
def getOutline(self, htmltree):
return self.getTreeElement(htmltree, self.expr_outline).get('content')
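# Hypothetical usage sketch (not part of the original diff). It assumes the
# Parser base class takes no constructor arguments and exposes a generic
# search(number) pipeline (queryNumberUrl -> getHtml -> dictformat), as the
# Xcity scraper later in this PR does explicitly.
if __name__ == '__main__':
    demo = Tmdb()
    print(demo.search('550'))  # '550' is a numeric TMDB movie id, used purely as an example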

31
scrapinglib/utils.py Normal file
View File

@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
from lxml.html import HtmlElement
def getTreeElement(tree: HtmlElement, expr, index=0):
""" 根据表达式从`xmltree`中获取匹配值,默认 index 为 0
:param tree (html.HtmlElement)
:param expr
:param index
"""
if expr == '':
return ''
result = tree.xpath(expr)
try:
return result[index]
except:
return ''
def getTreeAll(tree: HtmlElement, expr):
""" 根据表达式从`xmltree`中获取全部匹配值
:param tree (html.HtmlElement)
:param expr
:param index
"""
if expr == '':
return ''
result = tree.xpath(expr)
try:
return result
except:
return ''
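# Behavioural example (illustration only, not part of the original diff),
# run against a tiny in-memory fragment.
if __name__ == '__main__':
    from lxml import html as lxml_html
    demo_tree = lxml_html.fromstring('<ul><li>first</li><li>second</li></ul>')
    print(getTreeElement(demo_tree, '//li/text()'))     # 'first'  (index defaults to 0)
    print(getTreeElement(demo_tree, '//li/text()', 1))  # 'second'
    print(getTreeAll(demo_tree, '//li/text()'))         # ['first', 'second']
    print(getTreeElement(demo_tree, '//p/text()'))      # ''  (no match falls back to '')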

122
scrapinglib/xcity.py Normal file
View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
import re
import secrets
from urllib.parse import urljoin
from lxml import etree
from .httprequest import get_html_by_form
from .parser import Parser
class Xcity(Parser):
source = 'xcity'
expr_number = '//*[@id="hinban"]/text()'
expr_title = '//*[@id="program_detail_title"]/text()'
expr_studio = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()'
expr_studio2 = '//strong[contains(text(),"片商")]/../following-sibling::span/a/text()'
expr_runtime = '//span[@class="koumoku" and text()="収録時間"]/../text()'
expr_label = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()'
expr_release = '//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()'
expr_tags = '//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()'
expr_cover = '//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href'
expr_director = '//*[@id="program_detail_director"]/text()'
expr_series = "//span[contains(text(),'シリーズ')]/../a/span/text()"
expr_series2 = "//span[contains(text(),'シリーズ')]/../span/text()"
def getStudio(self, htmltree):
return super().getStudio(htmltree).strip('+').replace("', '", '').replace('"', '')
def getRuntime(self, htmltree):
return self.getTreeAll(htmltree, self.expr_runtime)[1].strip()
def getRelease(self, htmltree):
try:
result = self.getTreeElement(htmltree, self.expr_release, 1)
return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
except:
return ''
def getTags(self, htmltree):
result = self.getTreeAll(htmltree, self.expr_tags)
total = []
for i in result:
total.append(i.replace("\n","").replace("\t",""))
return total
def getCover(self, htmltree):
try:
result = super().getCover(htmltree)
return 'https:' + result
except:
return ''
def getDirector(self, htmltree):
try:
result = super().getDirector(htmltree).replace(u'\n','').replace(u'\t', '')
return result
except:
return ''
def getOutline(self, htmltree):
if self.morestoryline:
from .storyline import getStoryline
return getStoryline(self.number, uncensored=False)
return ''
def getActors(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(self, htmltree):
htmla = self.browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = {i.text.strip(): i['href'] for i in htmla}
o = {}
for k, v in t.items():
r = self.browser.open_relative(v)
if not r.ok:
continue
pic = self.browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
if 'noimage.gif' in pic['src']:
continue
o[k] = urljoin(self.browser.url, pic['src'])
return o
def getExtrafanart(self, htmltree):
html_pattern = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
html = html_pattern.search(self.detail_page)
if html:
html = html.group()
extrafanart_pattern = re.compile(r'<a.*?href="(.*?)"')
extrafanart_imgs = extrafanart_pattern.findall(html)
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
def open_by_browser(self, number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links(r'avod/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
return str(browser.page), browser
def search(self, number):
self.number = number
self.detail_page, self.browser = self.open_by_browser(number)
self.detailurl = self.browser.url
lx = etree.fromstring(self.detail_page, etree.HTMLParser())
result = self.dictformat(lx)
return result
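# Hypothetical standalone invocation (not part of the original diff). It assumes
# the Parser base class takes no constructor arguments; the number is a
# placeholder, and search() returns whatever dictformat() assembles from the
# live detail page.
if __name__ == '__main__':
    demo = Xcity()
    print(demo.search('ABC-123'))  # placeholder id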