remove webcrawler

Mathhew
2022-05-27 17:06:31 +08:00
parent c1fd755ccb
commit feccd67115
20 changed files with 0 additions and 3494 deletions


@@ -1,372 +0,0 @@
import json
import re
from multiprocessing.pool import ThreadPool
import ADC_function
import config
from ADC_function import translate
from lxml import etree
from pathlib import Path
# =========website========
from . import airav
from . import avsox
from . import fanza
from . import fc2
from . import jav321
from . import javbus
from . import javdb
from . import mgstage
from . import xcity
# from . import javlib
from . import dlsite
from . import carib
from . import fc2club
from . import mv91
from . import madou
from . import gcolle
from . import getchu
def get_data_state(data: dict) -> bool: # detect whether the metadata fetch failed
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
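# illustrative values:
# get_data_state({'title': 'Some Title', 'number': 'ABC-123'}) -> True
# get_data_state({'title': '', 'number': 'ABC-123'}) -> False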
def get_data_from_json(file_number, oCC):
"""
iterate through all services and fetch the data; return the metadata parsed from JSON
"""
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
func_mapping = {
"airav": airav.main,
"avsox": avsox.main,
"fc2": fc2.main,
"fanza": fanza.main,
"javdb": javdb.main,
"javbus": javbus.main,
"mgstage": mgstage.main,
"jav321": jav321.main,
"xcity": xcity.main,
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main,
"gcolle": gcolle.main,
"getchu": getchu.main,
}
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
def insert(sources,source):
if source in sources:
sources.insert(0, sources.pop(sources.index(source)))
return sources
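# the helper moves the matched source to the front of the fetch order, e.g. (illustrative)
# insert(['airav', 'javbus', 'carib'], 'carib') -> ['carib', 'airav', 'javbus']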
if len(sources) <= len(func_mapping):
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
):
sources = insert(sources,"carib")
elif "item" in file_number or "GETCHU" in file_number.upper():
sources = insert(sources,"getchu")
elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
sources = insert(sources, "getchu")
sources = insert(sources, "dlsite")
elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
if "avsox" in sources:
sources = insert(sources,"avsox")
elif "mgstage" in sources and \
(re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
sources = insert(sources,"mgstage")
elif "fc2" in lo_file_number:
if "fc2" in sources:
sources = insert(sources,"fc2")
elif "gcolle" in sources and (re.search("\d{6}", file_number)):
sources = insert(sources,"gcolle")
elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
if "xcity" in sources:
sources = insert(sources,"xcity")
if "madou" in sources:
sources = insert(sources,"madou")
elif "madou" in sources and (
re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
):
sources = insert(sources,"madou")
# check sources in func_mapping
todel = []
for s in sources:
if s not in func_mapping:
print('[!] Source Not Exist : ' + s)
todel.append(s)
for d in todel:
print('[!] Remove Source : ' + d)
sources.remove(d)
json_data = {}
if conf.multi_threading():
pool = ThreadPool(processes=len(conf.sources().split(',')))
# submit the crawl task for every source to the thread pool in priority order
for source in sources:
pool.apply_async(func_mapping[source], (file_number,))
# collect the crawl responses in the same priority order
for source in sources:
if conf.debug() == True:
print('[+]select', source)
try:
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
except:
json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
# if any service returns valid data, stop
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
pool.close()
pool.terminate()
else:
for source in sources:
try:
if conf.debug() == True:
print('[+]select', source)
try:
json_data = json.loads(func_mapping[source](file_number))
except:
json_data = func_mapping[source](file_number)
# if any service returns valid data, stop
if get_data_state(json_data):
print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
except:
break
# Return if data not found in all sources
if not json_data:
print('[-]Movie Number not found!')
return None
# Strictly compare the returned number to avoid faulty sources that answer every query with the same
# mismatched record (e.g. always returning "本橋実来 ADZ335").
# The number naming rules currently follow javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z).
# Other naming rules, such as airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z), can also be tracked.
# If identically named numbers from different studios ever collide under the javdb.com rules, switch rules
# and update the corresponding number parsing and crawling code.
if str(json_data.get('number')).upper() != file_number.upper():
if not json_data.get('allow_number_change'):
print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
return None
# ================================================ end of per-site rules ================================================
title = json_data.get('title')
actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # convert the string to a list
actor_list = [actor.strip() for actor in actor_list] # strip whitespace
director = json_data.get('director')
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
if json_data.get('cover_small'):
cover_small = json_data.get('cover_small')
else:
cover_small = ''
if json_data.get('trailer'):
trailer = json_data.get('trailer')
else:
trailer = ''
if json_data.get('extrafanart'):
extrafanart = json_data.get('extrafanart')
else:
extrafanart = ''
imagecut = json_data.get('imagecut')
tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # convert the string to a list
while 'XXXX' in tag:
tag.remove('XXXX')
while 'xxx' in tag:
tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
print('[-]Movie Number or Title not found!')
return None
# if imagecut == '3':
# DownloadFileWithFilename()
# ==================== sanitize special characters ====================== #\/:*?"<>|
actor = special_characters_replacement(actor)
actor_list = [special_characters_replacement(a) for a in actor_list]
title = special_characters_replacement(title)
label = special_characters_replacement(label)
outline = special_characters_replacement(outline)
series = special_characters_replacement(series)
studio = special_characters_replacement(studio)
director = special_characters_replacement(director)
tag = [special_characters_replacement(t) for t in tag]
release = release.replace('/', '-')
tmpArr = cover_small.split(',')
if len(tmpArr) > 0:
cover_small = tmpArr[0].strip('\"').strip('\'')
# ==================== sanitize special characters END ================== #\/:*?"<>|
# write the processed values back into json_data and return it
json_data['title'] = title
json_data['original_title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['year'] = year
json_data['actor_list'] = actor_list
json_data['trailer'] = trailer
json_data['extrafanart'] = extrafanart
json_data['label'] = label
json_data['outline'] = outline
json_data['series'] = series
json_data['studio'] = studio
json_data['director'] = director
if conf.is_translate():
translate_values = conf.translate_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue
except:
pass
if conf.get_translate_engine() == "azure":
t = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_translate_engine(),
key=conf.get_translate_key(),
)
else:
t = translate(json_data[translate_value])
if len(t):
json_data[translate_value] = special_characters_replacement(t)
if oCC:
cc_vars = conf.cc_convert_vars().split(",")
ccm = conf.cc_convert_mode()
def convert_list(mapping_data,language,vars):
total = []
for i in vars:
if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")) != 0:
i = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")[0]
total.append(i)
return total
def convert(mapping_data,language,vars):
if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
else:
raise IndexError('keyword not found')
for cc in cc_vars:
if json_data[cc] == "" or len(json_data[cc]) == 0:
continue
if cc == "actor":
try:
if ccm == 1:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_cn", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_cn", json_data['actor'])
elif ccm == 2:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_tw", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_tw", json_data['actor'])
elif ccm == 3:
json_data['actor_list'] = convert_list(actor_mapping_data, "jp", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "jp", json_data['actor'])
except:
json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
json_data['actor'] = oCC.convert(json_data['actor'])
elif cc == "tag":
try:
if ccm == 1:
json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
except:
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
else:
try:
if ccm == 1:
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
except IndexError:
json_data[cc] = oCC.convert(json_data[cc])
except:
pass
naming_rule=""
for i in conf.naming_rule().split("+"):
if i not in json_data:
naming_rule += i.strip("'").strip('"')
else:
item = json_data.get(i)
naming_rule += item if type(item) is not list else "&".join(item)
json_data['naming_rule'] = naming_rule
return json_data
def special_characters_replacement(text) -> str:
if not isinstance(text, str):
return text
return (text.replace('\\', '∖'). # U+2216 SET MINUS @ Basic Multilingual Plane
replace('/', '∕'). # U+2215 DIVISION SLASH @ Basic Multilingual Plane
replace(':', '꞉'). # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
replace('*', '∗'). # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
replace('?', '？'). # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
replace('"', '＂'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
replace('&lsquo;', '‘'). # U+2018 LEFT SINGLE QUOTATION MARK
replace('&rsquo;', '’'). # U+2019 RIGHT SINGLE QUOTATION MARK
replace('&hellip;','').
replace('&amp;', '')
)
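# e.g. special_characters_replacement('A/B:C?') -> 'A∕B꞉C？' (illustrative, using the look-alike replacements above)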


@@ -1,227 +0,0 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup  # needs install (beautifulsoup4)
from ADC_function import *
from WebCrawler import javbus
'''
API
Sign up: https://www.airav.wiki/api/auth/signup
Settings: https://www.airav.wiki/api/get_web_settings
Search: https://www.airav.wiki/api/video/list?lng=zh-CN&search=
Search: https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
'''
host = 'https://www.airav.wiki'
# airav has no actor photos, so use the ones from javbus directly
def getActorPhoto(javbus_json):
result = javbus_json.get('actor_photo')
if isinstance(result, dict) and len(result):
return result
return ''
def getTitle(htmlcode): # get the title
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
def getStudio(htmlcode, javbus_json): # get the studio (revised)
# prefer the javbus data when it is available
result = javbus_json.get('studio')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode,etree.HTMLParser())
return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
def getYear(htmlcode, javbus_json): # get the year
result = javbus_json.get('year')
if isinstance(result, str) and len(result):
return result
release = getRelease(htmlcode, javbus_json)
if len(release) != len('2000-01-01'):
return ''
return release[:4]
def getCover(htmlcode, javbus_json): # get the cover image
result = javbus_json.get('cover')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
def getRelease(htmlcode, javbus_json): # get the release date
result = javbus_json.get('release')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
except:
return ''
return result
def getRuntime(javbus_json): # get the runtime
result = javbus_json.get('runtime')
if isinstance(result, str) and len(result):
return result
return ''
# the airav actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
def getActor(htmlcode, javbus_json): # get the actresses
b=[]
html = etree.fromstring(htmlcode, etree.HTMLParser())
a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
for v in a:
v = v.strip()
if len(v):
b.append(v)
if len(b):
return b
result = javbus_json.get('actor')
if isinstance(result, list) and len(result):
return result
return []
def getNum(htmlcode, javbus_json): # get the number
result = javbus_json.get('number')
if isinstance(result, str) and len(result):
return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('/html/head/title/text()')[0])
result = str(re.findall('^\[(.*?)]', title)[0])
return result
def getDirector(javbus_json): # get the director (revised)
result = javbus_json.get('director')
if isinstance(result, str) and len(result):
return result
return ''
def getOutline(htmlcode): # get the outline
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
return result
except:
return ''
def getSerise(javbus_json): # get the series (revised)
result = javbus_json.get('series')
if isinstance(result, str) and len(result):
return result
return ''
def getTag(htmlcode): # get the tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
x = soup.find_all(attrs={'class': 'tagBtnMargin'})
a = x[0].find_all('a')
for i in a:
tag.append(i.get_text())
return tag
def getExtrafanart(htmlcode): # get the extra fanart (stills)
html_pather = re.compile(r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def search(keyword): # search and return the list of results
result = []
page = 1
while page > 0:
# search_result = {"offset": 0,"count": 4,"result": [
# {"vid": "99-07-15076","slug": "Wrop6o","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
# "url": "","view": 98,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15076.jpg","barcode": "_1pondo_012717_472"},
# {"vid": "99-27-00286","slug": "DlPEua","name": "放課後に、仕込んでください 〜優等生は無言でスカートを捲り上げる〜",
# "url": "","view": 69,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00286.jpg","barcode": "caribbeancom012717-360"},
# {"vid": "99-07-15070","slug": "VLS3WY","name": "放課後に、仕込んでください ~優等生は無言でスカートを捲り上げる~ ももき希",
# "url": "","view": 58,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-07-15070.jpg","barcode": "caribbeancom_012717-360"},
# {"vid": "99-27-00287","slug": "YdMVb3","name": "朝ゴミ出しする近所の遊び好きノーブラ奥さん 江波りゅう",
# "url": "","view": 56,"img_url": "https://wiki-img.airav.wiki/storage/big_pic/99-27-00287.jpg","barcode": "1pondo_012717_472"}
# ],"status": "ok"}
search_result = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page))
try:
json_data = json.loads(search_result)
except json.decoder.JSONDecodeError:
# print("[-]Json decoder error!")
return []
result_offset = int(json_data["offset"])
result_count = int(json_data["count"])
result_size = len(json_data["result"])
if result_count <= 0 or result_size <= 0:
return result
elif result_count > result_offset + result_size: # more results remain, request the next page
result.extend(json_data["result"])
page += 1
elif result_count == result_offset + result_size: # this was the last page
result.extend(json_data["result"])
page = 0
else:
page = 0
return result
def main(number):
try:
try:
htmlcode = get_html('https://cn.airav.wiki/video/' + number)
javbus_json = json.loads(javbus.main(number))
except:
# print(number)
pass
dic = {
# title: use airav
'title': getTitle(htmlcode),
# studio: try javbus first, fall back to this site
'studio': getStudio(htmlcode, javbus_json),
# year: try javbus first, fall back to this site
'year': getYear(htmlcode, javbus_json),
# outline: use airav
'outline': getOutline(htmlcode),
# runtime: use javbus
'runtime': getRuntime(javbus_json),
# director: use javbus
'director': getDirector(javbus_json),
# actor: try airav first
'actor': getActor(htmlcode, javbus_json),
# release date: try javbus first
'release': getRelease(htmlcode, javbus_json),
# number: use javbus
'number': getNum(htmlcode, javbus_json),
# cover link: use javbus
'cover': getCover(htmlcode, javbus_json),
# fetch the extra fanart (stills)
'extrafanart': getExtrafanart(htmlcode),
'imagecut': 1,
# tags: use airav
'tag': getTag(htmlcode),
# label: use javbus
'label': getSerise(javbus_json),
'actor_photo': getActorPhoto(javbus_json),
'website': 'https://www.airav.wiki/video/' + number,
'source': 'airav.py',
# series: use javbus
'series': getSerise(javbus_json)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('ADV-R0624')) # the javbus page returns 404, airav has data
print(main('ADN-188')) # single actress
print(main('CJOD-278')) # multiple actresses; javbus uses kana names, airav uses kanji


@@ -1,86 +0,0 @@
import sys
sys.path.append('..')
from ADC_function import *
from WebCrawler.storyline import getStoryline
from WebCrawler.crawler import *
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getActorPhoto(html):
a = html.xpath('//a[@class="avatar-box"]')
d = {}
for i in a:
l = i.find('.//img').attrib['src']
t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d
def getActor(html):
a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
d.append(i.find('span').text)
return d
def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(html):
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x[2:]] if len(x) > 2 else []
def main(number):
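# tellme.pw/avsox lists the currently reachable avsox mirror (the domain rotates); resolve it here before searching (assumption based on the lookup below)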
html = get_html('https://tellme.pw/avsox')
site = Crawler(html).getString('//div[@class="container"]/div/a/@href')
a = get_html(site + '/cn/search/' + number)
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
html = Crawler(a)
result1 = html.getString('//*[@id="waterfall"]/div/a/@href')
detail = get_html("https:" + result1)
lx = etree.fromstring(detail, etree.HTMLParser())
avsox_crawler2 = Crawler(a)
avsox_crawler = Crawler(detail)
try:
new_number = avsox_crawler.getString('//span[contains(text(),"识别码:")]/../span[2]/text()')
if new_number.upper() != number.upper():
raise ValueError('number not found')
title = avsox_crawler.getString('/html/body/div[2]/h3/text()').replace('/','').strip(new_number)
dic = {
'actor': getActor(lx),
'title': title,
'studio': avsox_crawler.getString('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()').replace("', '",' '),
'outline': getStoryline(number, title),
'runtime': avsox_crawler.getString('//span[contains(text(),"长度:")]/../text()').replace('分钟',''),
'director': '', #
'release': avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'),
'number': new_number,
'cover': avsox_crawler.getString('/html/body/div[2]/div[1]/div[1]/a/img/@src'),
#'cover_small' : getCover_small(html),
'cover_small': avsox_crawler2.getString('//*[@id="waterfall"]/div/a/div[1]/img/@src'),
'imagecut': 3,
'tag': getTag(lx),
'label': avsox_crawler.getString('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()'),
'year': re.findall('\d{4}',avsox_crawler.getString('//span[contains(text(),"发行时间:")]/../text()'))[0],
'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
'series': avsox_crawler.getString('//span[contains(text(),"系列:")]/../span[2]/text()'),
}
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == "__main__":
print(main('012717_472'))
print(main('1')) # gets a fake result; raises 'number not found'


@@ -1,133 +0,0 @@
import sys
sys.path.append('../')
from lxml import html
from ADC_function import *
from WebCrawler.storyline import getStoryline
G_SITE = 'https://www.caribbeancom.com'
def main(number: str) -> str:
try:
url = f'{G_SITE}/moviepages/{number}/index.html'
result, session = get_html_session(url, return_type='session')
htmlcode = result.content.decode('euc-jp')
if not result or not htmlcode or '<title>404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
lx = html.fromstring(htmlcode)
title = get_title(lx)
dic = {
'title': title,
'studio': '加勒比',
'year': get_year(lx),
'outline': get_outline(lx, number, title),
'runtime': get_runtime(lx),
'director': '',
'actor': get_actor(lx),
'release': get_release(lx),
'number': number,
'cover': f'{G_SITE}/moviepages/{number}/images/l_l.jpg',
'tag': get_tag(lx),
'extrafanart': get_extrafanart(lx),
'label': get_series(lx),
'imagecut': 1,
'website': f'{G_SITE}/moviepages/{number}/index.html',
'source': 'carib.py',
'series': get_series(lx),
'无码': True
}
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = get_actor_photo(lx, session)
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
return js
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
g = getStoryline(number, title, 无码=True)
if len(g):
return g
return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
def get_actor(lx: html.HtmlElement):
r = []
actors = lx.xpath("//span[@class='spec-content']/a[@itemprop='actor']/span/text()")
for act in actors:
if str(act) != '':
r.append(act)
return r
def get_tag(lx: html.HtmlElement) -> str:
genres = lx.xpath("//span[@class='spec-content']/a[@itemprop='genre']/text()")
return genres
def get_extrafanart(lx: html.HtmlElement) -> str:
r = []
genres = lx.xpath("//*[@id='sampleexclude']/div[2]/div/div[@class='grid-item']/div/a/@href")
for g in genres:
jpg = str(g)
if '/member/' in jpg:
break
else:
r.append('https://www.caribbeancom.com' + jpg)
return r
def get_series(lx: html.HtmlElement) -> str:
try:
return str(lx.xpath("//span[@class='spec-title'][contains(text(),'シリーズ')]/../span[@class='spec-content']/a/text()")[0]).strip()
except:
return ''
def get_runtime(lx: html.HtmlElement) -> str:
return str(lx.xpath("//span[@class='spec-content']/span[@itemprop='duration']/text()")[0]).strip()
def get_actor_photo(lx, session):
htmla = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']")
names = lx.xpath("//*[@id='moviepages']/div[@class='container']/div[@class='inner-container']/div[@class='movie-info section']/ul/li[@class='movie-spec']/span[@class='spec-content']/a[@itemprop='actor']/span[@itemprop='name']/text()")
t = {}
for name, a in zip(names, htmla):
if name.strip() == '':
continue
p = {name.strip(): a.attrib['href']}
t.update(p)
o = {}
for k, v in t.items():
if '/search_act/' not in v:
continue
r = session.get(urljoin(G_SITE, v))
if not r.ok:
continue
html = r.text
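# the actress page sets her photo as a CSS background on the '.full-bg' element; pull the jpg URL out of that style block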
pos = html.find('.full-bg')
if pos<0:
continue
css = html[pos:pos+100]
cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
p = {k: urljoin(r.url, cssBGjpgs[0])}
o.update(p)
return o
if __name__ == "__main__":
print(main("070116-197")) # actor have photo
print(main("041721-001"))
print(main("080520-001"))


@@ -1,28 +0,0 @@
from lxml import etree
class Crawler:
def __init__(self,htmlcode):
self.html = etree.HTML(htmlcode)
def getString(self,_xpath):
if _xpath == "":
return ""
result = self.html.xpath(_xpath)
try:
return result[0]
except:
return ""
def getStrings(self,_xpath):
result = self.html.xpath(_xpath)
try:
return result
except:
return ""
def getOutline(self,_xpath):
result = self.html.xpath(_xpath)
try:
return "\n".join(result)
except:
return ""


@@ -1,185 +0,0 @@
import re
from lxml import etree
import json
import sys
sys.path.append('../')
from ADC_function import *
def getTitle(html):
result = str(html.xpath('/html/head/title/text()')[0])
result = result[:result.rfind(' | DLsite')]
result = result[:result.rfind(' [')]
result = result.replace('【HD版】', '')
return result
def getActor(html): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
try:
result1 = html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
except:
result1 = ''
return result1
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
d={}
for i in a:
p={i:''}
d.update(p)
return d
def getStudio(html):
try:
try:
result = html.xpath('//th[contains(text(),"商标名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
def getYear(getRelease):
try:
result = str(re.search('\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(html):
result1 = html.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')[0]
return result1.replace('年', '-').replace('月', '-').replace('日', '')
def getTag(html):
try:
result = html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
return result
except:
return ''
def getCover_small(a, index=0):
# same issue mentioned below,
# javdb sometimes returns multiple results
# DO NOT just get the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(html):
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li[1]/picture/source/@srcset')[0]
return result.replace('.webp', '.jpg')
def getDirector(html):
try:
result = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')[0]
except:
result = ''
return result
def getOutline(html):
total = []
result = html.xpath('//*[@class="work_parts_area"]/p/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(html):
try:
try:
result = html.xpath('//th[contains(text(),"系列名")]/../td/span[1]/a/text()')[0]
except:
result = html.xpath('//th[contains(text(),"社团名")]/../td/span[1]/a/text()')[0]
except:
result = ''
return result
#
def getExtrafanart(html):
try:
result = []
for i in html.xpath('//*[@id="work_left"]/div/div/div[1]/div/@data-src'):
result.append("https:" + i)
except:
result = ''
return result
def main(number):
try:
if "RJ" in number or "VJ" in number:
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html/?locale=zh_CN', cookies={'locale': 'zh-cn'})
html = etree.fromstring(htmlcode, etree.HTMLParser())
else:
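# not an RJ/VJ code: search dlsite by keyword and, if nothing is found, progressively trim edition markers from the title and retry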
htmlcode = get_html(f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'})
html = etree.HTML(htmlcode)
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace("THE ANIMATION", "").replace("he Animation", "").replace("t", "").replace("T","")
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
if "" in number:
number = number.replace("","")
elif "" in number:
number = number.replace("","")
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
if len(search_result) == 0:
number = number.replace('上巻', '').replace('下巻', '').replace('前編', '').replace('後編', '')
html = etree.HTML(get_html(
f'https://www.dlsite.com/maniax/fsr/=/language/jp/sex_category/male/keyword/{number}/order/trend/work_type_category/movie', cookies={'locale': 'zh-cn'}))
search_result = html.xpath('//*[@id="search_result_img_box"]/li[1]/dl/dd[2]/div[2]/a/@href')
a = search_result[0]
html = etree.HTML(get_html(a,cookies={'locale': 'zh-cn'}))
number = str(re.findall("\wJ\w+",a)).strip(" [']")
dic = {
'actor': getStudio(html),
'title': getTitle(html),
'studio': getStudio(html),
'outline': getOutline(html),
'runtime': '',
'director': getDirector(html),
'release': getRelease(html),
'number': number,
'cover': 'https:' + getCover(html),
'cover_small': '',
'imagecut': 4,
'tag': getTag(html),
'label': getLabel(html),
'year': getYear(getRelease(html)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/maniax/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(html),
'extrafanart':getExtrafanart(html),
'allow_number_change':True,
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("debug_mode:switch=1")
print(main('牝教師4穢された教壇 「生意気ドジっ娘女教師・美結高飛車ハメ堕ち2濁金」'))
print(main('RJ329607'))


@@ -1,190 +0,0 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
from urllib.parse import urlencode
from ADC_function import *
from WebCrawler.crawler import *
class fanzaCrawler(Crawler):
def getFanzaString(self,string):
result1 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/a/text()")).strip(" ['']")
result2 = str(self.html.xpath("//td[contains(text(),'"+string+"')]/following-sibling::td/text()")).strip(" ['']")
return result1+result2
def getFanzaStrings(self, string):
result1 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/a/text()")
if len(result1) > 0:
return result1
result2 = self.html.xpath("//td[contains(text(),'" + string + "')]/following-sibling::td/text()")
return result2
def getRelease(fanza_Crawler):
result = fanza_Crawler.getFanzaString('発売日:')
if result == '' or result == '----':
result = fanza_Crawler.getFanzaString('配信開始日:')
return result.replace("/", "-").strip('\\n')
def getCover(html, number):
cover_number = number
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# sometimes fanza changes _ to \u005f in the image id
if "_" in cover_number:
cover_number = cover_number.replace("_", r"\u005f")
try:
result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
except:
# (TODO) handle more edge case
# print(html)
# raise exception here, same behavior as before
# people's major requirement is fetching the picture
raise ValueError("can not find image")
return result
def getOutline(html):
try:
result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace("\n", "")
if result == "":
result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace("\n", "")
except:
# (TODO) handle more edge case
# print(html)
return ""
return result
def getExtrafanart(htmlcode): # get the extra fanart (sample stills)
html_pather = re.compile(r'<div id=\"sample-image-block\"[\s\S]*?<br></div>\n</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for img_url in extrafanart_imgs:
img_urls = img_url.rsplit('-', 1)
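# sample thumbnails end in "-N.jpg"; inserting "jp" before the index yields the larger image (assumed DMM URL convention)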
img_url = img_urls[0] + 'jp-' + img_urls[1]
s.append(img_url)
return s
return ''
def main(number):
# fanza allows letters + numbers + underscores, so normalize the input here
# @note: the only underscore usage found so far is h_test123456789
fanza_search_number = number
# AV_Data_Capture.py getNumber() over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
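# e.g. "MIAA-391" -> "miaa391", "h-test00123" -> "h_test00123" (illustrative)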
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
"https://www.dmm.co.jp/rental/-/detail/=/cid=",
]
chosen_url = ""
fanza_Crawler = ''
for url in fanza_urls:
chosen_url = url + fanza_search_number
htmlcode = get_html(
"https://www.dmm.co.jp/age_check/=/declared=yes/?{}".format(
urlencode({"rurl": chosen_url})
)
)
fanza_Crawler = fanzaCrawler(htmlcode)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
return json.dumps({"title": "",})
try:
# for some old page, the input number does not match the page
# for example, the url will be cid=test012
# but the hinban on the page is test00012
# so get the hinban first, and then pass it to following functions
fanza_hinban = fanza_Crawler.getFanzaString('品番:')
out_num = fanza_hinban
number_lo = number.lower()
html = etree.fromstring(htmlcode, etree.HTMLParser())
if (re.sub('-|_', '', number_lo) == fanza_hinban or
number_lo.replace('-', '00') == fanza_hinban or
number_lo.replace('-', '') + 'so' == fanza_hinban
):
out_num = number
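# e.g. searching "pred-251" finds hinban "pred00251"; the second rule matches, so the original "pred-251" is kept as the output number (illustrative)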
data = {
"title": fanza_Crawler.getString('//*[starts-with(@id, "title")]/text()').strip(),
"studio": fanza_Crawler.getFanzaString('メーカー'),
"outline": getOutline(html),
"runtime": str(re.search(r'\d+',fanza_Crawler.getString("//td[contains(text(),'収録時間')]/following-sibling::td/text()")).group()).strip(" ['']"),
"director": fanza_Crawler.getFanzaString('監督:') if "anime" not in chosen_url else "",
"actor": fanza_Crawler.getString("//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()").replace("', '", ",") if "anime" not in chosen_url else "",
"release": getRelease(fanza_Crawler),
"number": out_num,
"cover": getCover(html, fanza_hinban),
"imagecut": 1,
"tag": fanza_Crawler.getFanzaStrings('ジャンル:'),
"extrafanart": getExtrafanart(htmlcode),
"label": fanza_Crawler.getFanzaString('レーベル'),
"year": re.findall('\d{4}',getRelease(fanza_Crawler))[0], # str(re.search('\d{4}',getRelease(a)).group()),
"actor_photo": "",
"website": chosen_url,
"source": "fanza.py",
"series": fanza_Crawler.getFanzaString('シリーズ:'),
}
except:
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
) # .encode('UTF-8')
return js
def main_htmlcode(number):
# fanza allows letters + numbers + underscores, so normalize the input here
# @note: the only underscore usage found so far is h_test123456789
fanza_search_number = number
# AV_Data_Capture.py getNumber() over-formats the input, so restore the h_ prefix
if fanza_search_number.startswith("h-"):
fanza_search_number = fanza_search_number.replace("h-", "h_")
fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()
fanza_urls = [
"https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/videoc/-/detail/=/cid=",
"https://www.dmm.co.jp/digital/nikkatsu/-/detail/=/cid=",
]
chosen_url = ""
for url in fanza_urls:
chosen_url = url + fanza_search_number
htmlcode = get_html(chosen_url)
if "404 Not Found" not in htmlcode:
break
if "404 Not Found" in htmlcode:
return json.dumps({"title": "",})
return htmlcode
if __name__ == "__main__":
# print(main("DV-1562"))
# print(main("96fad1217"))
print(main("pred00251"))
print(main("MIAA-391"))
print(main("OBA-326"))


@@ -1,80 +0,0 @@
import sys
sys.path.append('../')
import re
import json
import config
import ADC_function
from WebCrawler.crawler import *
def getExtrafanart(htmlcode): # get the extra fanart (sample stills)
html_pather = re.compile(r'<ul class=\"items_article_SampleImagesArea\"[\s\S]*?</ul>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def getTrailer(htmlcode, number):
video_pather = re.compile(r'\'[a-zA-Z0-9]{32}\'')
video = video_pather.findall(htmlcode)
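# the article page embeds a 32-character sample key; it is used below to query the fc2 sample-video API (assumption based on the request URL)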
if video:
try:
video_url = video[0].replace('\'', '')
video_url = 'https://adult.contents.fc2.com/api/v2/videos/' + number + '/sample?key=' + video_url
url_json = eval(ADC_function.get_html(video_url))['path'].replace('\\', '')
return url_json
except:
return ''
else:
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
htmlcode2 = ADC_function.get_html('https://adult.contents.fc2.com/article/' + number + '/', encoding='utf-8')
fc2_crawler = Crawler(htmlcode2)
actor = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')
if actor == "":
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = fc2_crawler.getString("//div[@class='items_article_MainitemThumb']/span/img/@src")
cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
release = fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/div[2]/p/text()').\
strip(" ['販売日 : ']").replace('/','-')
dic = {
'title': fc2_crawler.getString('/html/head/title/text()'),
'studio': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'year': re.findall('\d{4}',release)[0],
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': fc2_crawler.getString('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()'),
'actor': actor,
'release': release,
'number': 'FC2-' + number,
'label': '',
'cover': cover,
'thumb': cover,
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2, number),
'imagecut': 0,
'tag': fc2_crawler.getStrings("//a[@class='tag tagTag']/text()"),
'actor_photo': '',
'website': 'https://adult.contents.fc2.com/article/' + number + '/',
'source': 'https://adult.contents.fc2.com/article/' + number + '/',
'series': '',
}
except Exception as e:
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
config.getInstance().set_override("debug_mode:switch=1")
#print(main('FC2-2182382'))
#print(main('FC2-607854'))
print(main('FC2-2787433'))


@@ -1,96 +0,0 @@
import sys
sys.path.append('../')
from lxml import etree  # needs install (lxml)
import json
import ADC_function
def getTitle_fc2com(htmlcode): # get the title
html = etree.fromstring(htmlcode,etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
return result
def getActor_fc2com(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[5]/a/text()')).strip(" ['']")
return result
except:
return ''
def getStudio_fc2com(htmlcode): # get the studio
try:
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
return result
except:
return ''
def getNum_fc2com(htmlcode): # get the number
html = etree.fromstring(htmlcode, etree.HTMLParser())
title = str(html.xpath('//*[@class="show-top-grids"]/div[1]/h3/text()')).strip(" ['']")
num = title.split(' ')[0]
if not num.startswith('FC2'):
num = ''
return num
def getRelease_fc2com(htmlcode2): #
return ''
def getCover_fc2com(htmlcode2): # get the cover image
html = etree.fromstring(htmlcode2, etree.HTMLParser())
imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
imgUrl = imgUrl.replace('../','https://fc2club.net/')
return imgUrl
def getTag_fc2com(htmlcode): # get the tags
html = etree.fromstring(htmlcode,etree.HTMLParser())
a = html.xpath('//*[@class="show-top-grids"]/div[1]/h5[4]/a')
tag = []
for i in range(len(a)):
tag.append(str(a[i].xpath('text()')).strip("['']"))
return tag
def getYear_fc2com(release):
return ''
def getExtrafanart(htmlcode): # get the extra fanart (stills)
html = etree.fromstring(htmlcode, etree.HTMLParser())
imgUrl = str(html.xpath('//*[@class="slides"]/li[1]/img/@src')).strip(" ['']")
imgUrl = imgUrl.replace('../','https://fc2club.net/')
return imgUrl
def getTrailer(htmlcode):
return ''
def main(number):
try:
number = number.replace('FC2-', '').replace('fc2-', '')
webUrl = 'https://fc2club.net/html/FC2-' + number + '.html'
htmlcode2 = ADC_function.get_html(webUrl)
actor = getActor_fc2com(htmlcode2)
if getActor_fc2com(htmlcode2) == '':
actor = 'FC2系列'
dic = {
'title': getTitle_fc2com(htmlcode2),
'studio': getStudio_fc2com(htmlcode2),
'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': '',
'director': getStudio_fc2com(htmlcode2),
'actor': actor,
'release': getRelease_fc2com(htmlcode2),
'number': 'FC2-' + number,
'label': '',
'cover': getCover_fc2com(htmlcode2),
'extrafanart': getExtrafanart(htmlcode2),
"trailer": getTrailer(htmlcode2),
'imagecut': 0,
'tag': getTag_fc2com(htmlcode2),
'actor_photo': '',
'website': 'https://fc2club.net/html/FC2-' + number + '.html/',
'source': 'https://fc2club.net/html/FC2-' + number + '.html/',
'series': '',
}
except Exception as e:
if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
print(main('FC2-402422'))


@@ -1,86 +0,0 @@
import sys
sys.path.append('../')
from WebCrawler.crawler import *
from ADC_function import *
def main(number):
save_cookies = False
cookie_filename = 'gcolle.json'
try:
gcolle_cooikes, cookies_filepath = load_cookies(cookie_filename)
session = get_html_session(cookies=gcolle_cooikes)
number = number.upper().replace('GCOLLE-','')
htmlcode = session.get('https://gcolle.net/product_info.php/products_id/' + number).text
gcolle_crawler = Crawler(htmlcode)
r18_continue = gcolle_crawler.getString('//*[@id="main_content"]/table[1]/tbody/tr/td[2]/table/tbody/tr/td/h4/a[2]/@href')
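# when the adult (R18) confirmation link is present, follow it and keep the session cookies so the prompt is skipped on later runs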
if r18_continue and r18_continue.startswith('http'):
htmlcode = session.get(r18_continue).text
gcolle_crawler = Crawler(htmlcode)
save_cookies = True
cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
number_html = gcolle_crawler.getString('//td[contains(text(),"商品番号")]/../td[2]/text()')
if number != number_html:
raise Exception('[-]gcolle.py: number not match')
if save_cookies:
cookies_save = Path.home() / f".local/share/mdc/{cookie_filename}"
cookies_save.parent.mkdir(parents=True, exist_ok=True)
cookies_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
# get extrafanart url
if len(gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')) == 0:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src')
else:
extrafanart = gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[3]/td/div/img/@src')
# Add "https:" in each extrafanart url
for i in range(len(extrafanart)):
extrafanart[i] = 'https:' + extrafanart[i]
dic = {
"title": gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[1]/td/h1/text()').strip(),
"studio": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"year": re.findall('\d{4}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"outline": gcolle_crawler.getOutline('//*[@id="cart_quantity"]/table/tr[3]/td/p/text()'),
"runtime": '',
"director": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"actor": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"release": re.findall('\d{4}-\d{2}-\d{2}',gcolle_crawler.getString('//td[contains(text(),"商品登録日")]/../td[2]/time/@datetime'))[0],
"number": "GCOLLE-" + str(number_html),
"cover": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"thumb": "https:" + gcolle_crawler.getString('//*[@id="cart_quantity"]/table/tr[3]/td/table/tr/td/a/@href'),
"trailer": '',
"actor_photo":'',
"imagecut": 4, # 该值为4时同时也是有码影片 也用人脸识别裁剪封面
"tag": gcolle_crawler.getStrings('//*[@id="cart_quantity"]/table/tr[4]/td/a/text()'),
"extrafanart":extrafanart,
"label": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
"website": 'https://gcolle.net/product_info.php/products_id/' + number,
"source": 'gcolle.py',
"series": gcolle_crawler.getString('//td[contains(text(),"アップロード会員名")]/b/text()'),
'无码': False,
}
# for k,v in dic.items():
# if k == 'outline':
# print(k,len(v))
# else:
# print(k,v)
# print('===============================================================')
except Exception as e:
dic = {'title':''}
if config.getInstance().debug():
print(e)
return dic
if __name__ == '__main__':
from pprint import pprint
config.getInstance().set_override("debug_mode:switch=1")
pprint(main('840724'))
pprint(main('840386'))
pprint(main('838671'))
pprint(main('814179'))
pprint(main('834255'))
pprint(main('814179'))


@@ -1,133 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.crawler import *
import re
import time
from urllib.parse import quote
JSON_HEADERS = {"Referer": "https://dl.getchu.com/"}
COOKIES_DL = {"adult_check_flag": "1"}
COOKIES_WWW = {'getchu_adalt_flag': 'getchu.com'}
GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
GETCHU_WWW_URL = 'http://www.getchu.com/soft.phtml?id=_WORD_'
GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
def get_dl_getchu(number):
if "item" in number or 'GETCHU' in number.upper():
number = re.findall('\d+',number)[0]
else:
htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number),
json_headers=JSON_HEADERS, cookies=COOKIES_DL)
getchu = Crawler(htmlcode)
url = getchu.getString(
'/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
if url == "":
return None
number = re.findall('\d+', url)[0]
htmlcode = get_html(GETCHU_DL_URL.replace("_WORD_", number), json_headers=JSON_HEADERS, cookies=COOKIES_DL)
getchu = Crawler(htmlcode)
dic = {
"title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"),
"cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"),
"director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(),
"studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"actor": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
"runtime": str(re.findall('\d+', str(getchu.getString(
"//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))).strip(" ['']"),
"release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-"),
"tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"),
"outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"),
"extrafanart": getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href"),
"series": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
"number": 'GETCHU-' + re.findall('\d+',number)[0],
"imagecut": 4,
"year": str(re.findall('\d{4}', str(getchu.getString(
"//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")))).strip(" ['']"),
"actor_photo": "",
"website": "https://dl.getchu.com/i/" + number,
"source": "getchu.py",
"allow_number_change": True,
}
extrafanart = []
for i in dic['extrafanart']:
i = "https://dl.getchu.com" + i
extrafanart.append(i)
dic['extrafanart'] = extrafanart
time.sleep(1)
return dic
def get_www_getchu(number):
number = quote(number, encoding="euc_jp")
getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if url2 == '':
getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
if url2 == "":
return None
url2 = url2.replace('../', 'http://www.getchu.com/')
getchu = Crawler(get_html(url2, cookies=COOKIES_WWW))
dic = {
"title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
"cover": "http://www.getchu.com" + getchu.getString(
"/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
"director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
"studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
"actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
"label": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
"runtime": '',
"release": getchu.getString("//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-").strip(),
"tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
"outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
"extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"),
"series": getchu.getString("//td[contains(text(),'ジャンル:')]/following-sibling::td/text()").strip(),
"number": 'GETCHU-' + re.findall('\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))[0],
"imagecut": 0,
"year": str(re.findall('\d{4}', str(getchu.getString(
"//td[contains(text(),'発売日:')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"),
"actor_photo": "",
"website": url2,
"headers": {'referer': url2},
"source": "getchu.py",
"allow_number_change": True,
}
extrafanart = []
for i in dic['extrafanart']:
i = "http://www.getchu.com" + i.replace("./", '/')
if 'jpg' in i:
extrafanart.append(i)
dic['extrafanart'] = extrafanart
time.sleep(1)
return dic
def main(number):
number = number.replace("-C", "")
dic = {}
if "item" in number:
sort = ["get_dl_getchu(number)", "get_www_getchu(number)"]
else:
sort = ["get_www_getchu(number)", "get_dl_getchu(number)"]
for i in sort:
dic = eval(i)
if dic != None:
break
if dic == None:
return {"title" : ""}
outline = ''
_list = dic['outline']
for i in _list:
outline = outline + i
dic['outline'] = outline
result = json.dumps(dic,ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )
return result
if __name__ == '__main__':
test = []
for i in test:
print(i)
print(main(i))


@@ -1,185 +0,0 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
import re
def main(number: str) -> str:
try:
result = post_html(url="https://www.jav321.com/search", query={"sn": number})
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
except:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
if "/video/" in result.url:
data = parse_info(soup)
dic = {
"title": get_title(lx),
"year": get_year(data),
"outline": get_outline(lx),
"director": "",
"cover": get_cover(lx),
"imagecut": 1,
"trailer": get_trailer(result.text),
"extrafanart": get_extrafanart(result.text),
"actor_photo": "",
"website": result.url,
"source": "jav321.py",
**data,
}
else:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()")[0].strip()
def parse_info(soup: BeautifulSoup) -> dict:
data = soup.select_one("div.row > div.col-md-9")
if data:
dd = str(data).split("<br/>")
data_dic = {}
for d in dd:
data_dic[get_bold_text(h=d)] = d
return {
"actor": get_actor(data_dic),
"label": get_label(data_dic),
"studio": get_studio(data_dic),
"tag": get_tag(data_dic),
"number": get_number(data_dic).upper(),
"release": get_release(data_dic),
"runtime": get_runtime(data_dic).replace(" minutes", ""),
"series": get_series(data_dic),
}
else:
return {"title": ""}
def get_bold_text(h: str) -> str:
soup = BeautifulSoup(h, "html.parser")
if soup.b:
return soup.b.text
else:
return "UNKNOWN_TAG"
def get_anchor_info(h: str) -> str:
result = []
data = BeautifulSoup(h, "html.parser").find_all("a", href=True)
for d in data:
result.append(d.text)
return ",".join(result)
def get_text_info(h: str) -> str:
return h.split(": ")[1]
def get_trailer(html) -> str:
videourl_pather = re.compile(r'<source src=\"(.*?)\"')
videourl = videourl_pather.findall(html)
if videourl:
url = videourl[0].replace('awscc3001.r18.com', 'cc3001.dmm.co.jp').replace('cc3001.r18.com', 'cc3001.dmm.co.jp')
return url
else:
return ''
def get_extrafanart(htmlcode):  # sample/still images
html_pather = re.compile(r'<div class=\"col\-md\-3\"><div class=\"col\-xs\-12 col\-md\-12\">[\s\S]*?</script><script async src=\"\/\/adserver\.juicyads\.com/js/jads\.js\">')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<img.*?src=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def get_cover(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[2]/div[1]/p/a/img/@src")[0]
def get_outline(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")[0]
def get_series2(lx: html.HtmlElement) -> str:
return lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[1]/div[2]/a[11]/text()")[0]
def get_actor(data: dict) -> str:
if "出演者" in data:
return get_anchor_info(data["出演者"])
else:
return ""
def get_label(data: dict) -> str:
if "メーカー" in data:
return get_anchor_info(data["メーカー"])
else:
return ""
def get_tag(data: dict) -> str:
if "ジャンル" in data:
return get_anchor_info(data["ジャンル"])
else:
return ""
def get_studio(data: dict) -> str:
if "メーカー" in data:
return get_anchor_info(data["メーカー"])
else:
return ""
def get_number(data: dict) -> str:
if "品番" in data:
return get_text_info(data["品番"])
else:
return ""
def get_release(data: dict) -> str:
if "配信開始日" in data:
return get_text_info(data["配信開始日"])
else:
return ""
def get_runtime(data: dict) -> str:
if "収録時間" in data:
return get_text_info(data["収録時間"])
else:
return ""
def get_year(data: dict) -> str:
if "release" in data:
return data["release"][:4]
else:
return ""
def get_series(data: dict) -> str:
if "シリーズ" in data:
return get_anchor_info(data["シリーズ"])
else:
return ""
if __name__ == "__main__":
print(main("jul-404"))

View File

@@ -1,184 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
import inspect
def getActorPhoto(html):
actors = html.xpath('//div[@class="star-name"]/../a/img')
d = {}
for i in actors:
p = i.attrib['src']
if "nowprinting.gif" in p:
continue
t = i.attrib['title']
d[t] = urljoin("https://www.javbus.com", p)
return d
def getTitle(html):  # title
title = str(html.xpath('/html/head/title/text()')[0])
title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip()
return title
def getStudioJa(html):
x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getStudio(html):  # studio / maker
x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getYear(html):  # year
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
return result[:4] if len(result)>=len('2000-01-01') else ''
def getCover(html):  # cover image URL
image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
return urljoin("https://www.javbus.com", image)
def getRelease(html):  # release date
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
def getRuntime(html):  # runtime in minutes (revised)
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result
def getActor(html):  # actresses
b=[]
actors = html.xpath('//div[@class="star-name"]/a')
for i in actors:
b.append(i.attrib['title'])
return b
def getNum(html):  # movie id / number
kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return kwdlist[0]
def getDirectorJa(html):
x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getDirector(html):  # director
x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
def getOutline(number, title, uncensored):  # storyline / outline, queried concurrently across sites
if any(caller for caller in inspect.stack() if os.path.basename(caller.filename) == 'airav.py'):
        return ''  # calls coming from airav.py skip the outline to avoid re-fetching the same data and slowing processing down
return getStoryline(number,title, 无码=uncensored)
def getSeriseJa(html):
x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getSerise(html):  # series
x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
return str(x[0]) if len(x) else ''
def getTag(html):  # tags
klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return klist[1:]
def getExtrafanart(htmlcode):  # sample/still images
html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs]
return ''
def getUncensored(html):
x = html.xpath('//*[@id="navbar"]/ul[1]/li[@class="active"]/a[contains(@href,"uncensored")]')
return bool(x)
def main_uncensored(number):
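    # Uncensored lookup path: dots in the id are folded to dashes and the page is fetched from the
    # javbus.red mirror, using the Japanese-labelled field getters above.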
w_number = number.replace('.', '-')
htmlcode = get_html('https://www.javbus.red/' + w_number)
if "<title>404 Page Not Found" in htmlcode:
return {"title": ""}
lx = etree.fromstring(htmlcode, etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': title,
'studio': getStudioJa(lx),
'year': getYear(lx),
'outline': getOutline(w_number, title, True),
'runtime': getRuntime(lx),
'director': getDirectorJa(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSeriseJa(lx),
'imagecut': 0,
'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.red/' + w_number,
'source': 'javbus.py',
'series': getSeriseJa(lx),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
def main(number):
try:
try:
url = "https://www." + secrets.choice([
'buscdn.fun', 'busdmm.fun', 'busfan.fun', 'busjav.fun',
'cdnbus.fun',
'dmmbus.fun', 'dmmsee.fun',
'fanbus.us',
'seedmm.fun',
]) + "/"
try:
htmlcode = get_html(url + number)
except:
htmlcode = get_html('https://www.javbus.com/' + number)
if "<title>404 Page Not Found" in htmlcode:
return {"title": ""}
lx = etree.fromstring(htmlcode,etree.HTMLParser())
title = getTitle(lx)
dic = {
'title': title,
'studio': getStudio(lx),
'year': getYear(lx),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'actor': getActor(lx),
'release': getRelease(lx),
'number': getNum(lx),
'cover': getCover(lx),
'imagecut': 1,
'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
'label': getSerise(lx),
'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
'series': getSerise(lx),
'无码': getUncensored(lx)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
return main_uncensored(number)
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == "__main__" :
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('STAR-438'))
print(main('ABP-960'))
print(main('ADV-R0624')) # 404
print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
print(main('CJOD-278'))
print(main('BrazzersExxtra.21.02.01'))
print(main('100221_001'))
print(main('AVSW-061'))

View File

@@ -1,321 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
def getTitle(html):
browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip()
def getActor(html):
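    # Keep only actors whose gender symbol matches the configured actor_gender:
    # 'female' (the default), 'male', 'both' (any gendered entry) or 'all'.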
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = []
idx = 0
actor_gendor = config.getInstance().actor_gender()
if not actor_gendor in ['female','male','both','all']:
actor_gendor = 'female'
for act in actors:
if((actor_gendor == 'all')
or (actor_gendor == 'both' and genders[idx] in ['symbol female', 'symbol male'])
or (actor_gendor == 'female' and genders[idx] == 'symbol female')
or (actor_gendor == 'male' and genders[idx] == 'symbol male')):
r.append(act)
idx = idx + 1
return r
def getaphoto(url, session):
html_page = session.get(url).text
img_url = re.findall(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)', html_page)
return img_url[0] if img_url else ''
def getActorPhoto(html, javdb_site, session):
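    # For each actor kept by getActor(), try the predictable jdbstatic avatar URL first and fall back
    # to scraping the actor's profile page only when that image does not exist (HEAD request fails).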
actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
if not actorall:
return {}
a = getActor(html)
actor_photo = {}
if not session:
session = get_html_session()
for i in actorall:
x = re.findall(r'/actors/(.*)', i.attrib['href'], re.A)
if not len(x) or not len(x[0]) or i.text not in a:
continue
actor_id = x[0]
pic_url = f"https://c1.jdbstatic.com/avatars/{actor_id[:2].lower()}/{actor_id}.jpg"
if not session.head(pic_url).ok:
pic_url = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), session)
if len(pic_url):
actor_photo[i.text] = pic_url
return actor_photo
def getStudio(a, html):
patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>')
pianshang = patherr.findall(a)
if pianshang:
result = pianshang[0].strip()
if len(result):
return result
    # fall back to the seller as the studio
try:
result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
except:
result = ''
return result
def getRuntime(html):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getNum(html):
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
def getYear(getRelease):
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)\-.*?</span>')
dates = patherr.findall(getRelease)
if dates:
result = dates[0]
else:
result = ''
return result
def getRelease(a):
patherr = re.compile(r'<strong>日期\:</strong>\s*?.*?<span class="value">(.*?)</span>')
dates = patherr.findall(a)
if dates:
result = dates[0]
else:
result = ''
return result
def getTag(html):
try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
return result
except:
result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
return result
def getCover_small(html, index=0):
    # same issue mentioned below,
    # javdb sometimes returns multiple results
    # DO NOT just get the first one, get the one with the correct index number
try:
result = html.xpath("//*[contains(@class,'movie-list')]/div/a/div[contains(@class, 'cover')]/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except:
result = html.xpath("//div[@class='item-image']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getTrailer(htmlcode):  # trailer
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
video = video_pather.findall(htmlcode)
    # guard against an empty result list
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
video_url = video[0]
else:
video_url = ''
return video_url
def getExtrafanart(html):  # sample images
result = []
try:
result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href")
except:
pass
return result
def getCover(html):
try:
result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0]
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0]
return result
def getDirector(html):
result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getOutline(number, title, uncensored):  # storyline / outline, queried concurrently across sites
return getStoryline(number, title, 无码=uncensored)
def getSeries(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
def getUserRating(html):
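    # Parse the "X.X分, 由N人評價" text next to the score stars into (rating, vote count);
    # returns None when the page has no rating block.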
try:
result = str(html.xpath('//span[@class="score-stars"]/../text()')[0])
v = re.findall(r'(\d+|\d+\.\d+)分, 由(\d+)人評價', result)
return float(v[0][0]), int(v[0][1])
except:
return
def getUncensored(html):
x = html.xpath('//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?")'
' or contains(@href,"/tags/western?")]')
return bool(x)
def main(number):
    # After a javdb update only one numbered mirror can stay logged in at a time (a new login kicks
    # out the old one), so the site is chosen from the first javdb*.json cookie file that is found.
    # If no .json file exists, or it is past its validity period, a mirror is picked at random.
javdb_sites = config.getInstance().javdb_sites().split(',')
debug = config.getInstance().debug()
    javdb_sites = ["javdb" + site for site in javdb_sites]
    javdb_sites.append("javdb")
try:
# if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group():
# pass
# else:
# number = number.upper()
number = number.upper()
javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'}
        # Do not load expired cookies; the javdb login page advertises 7 days of persistent login, so cookies are assumed valid for 7 days.
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath)
if cdays < 7:
javdb_cookies = cookies_dict
has_json = True
break
elif cdays != 9999:
print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
if not has_json:
javdb_site = secrets.choice(javdb_sites)
if debug:
print(f'[!]javdb:select site {javdb_site}')
session = None
javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all'
try:
if debug:
raise # try get_html_by_scraper() branch
res, session = get_html_session(javdb_url, cookies=javdb_cookies, return_type='session')
if not res:
raise
query_result = res.text
except:
res, session = get_html_by_scraper(javdb_url, cookies=javdb_cookies, return_type='scraper')
if not res:
raise ValueError('page not found')
query_result = res.text
if session is None:
raise ValueError('page not found')
html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
        # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
        # iterate over all candidates and find the matching one
urls = html.xpath('//*[contains(@class,"movie-list")]/div/a/@href')
        # note: Western releases list their ids like ['Blacked','Blacked']
if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
correct_url = urls[0]
else:
ids = html.xpath('//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
try:
correct_url = urls[ids.index(number)]
except:
                # to avoid picking up a wrong id, only an exact match is accepted
if ids[0].upper() != number:
raise ValueError("number not found")
correct_url = urls[0]
try:
# get faster benefit from http keep-alive
javdb_detail_url = urljoin(res.url, correct_url)
detail_page = session.get(javdb_detail_url).text
except:
detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies)
session = None
        # etree.fromstring is expensive, so call it only once; its xpath is fast (faster than bs4 find/select) and can be used freely
lx = etree.fromstring(detail_page, etree.HTMLParser())
imagecut = 1
dp_number = getNum(lx)
if dp_number.upper() != number.upper():
raise ValueError("number not eq"+dp_number)
title = getTitle(lx)
if title and dp_number:
number = dp_number
# remove duplicate title
title = title.replace(number, '').strip()
dic = {
'actor': getActor(lx),
'title': title,
'studio': getStudio(detail_page, lx),
'outline': getOutline(number, title, getUncensored(lx)),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(detail_page),
'number': number,
'cover': getCover(lx),
'trailer': getTrailer(detail_page),
'extrafanart': getExtrafanart(lx),
'imagecut': imagecut,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()),
'website': urljoin('https://javdb.com', correct_url),
'source': 'javdb.py',
'series': getSeries(lx),
'无码': getUncensored(lx)
}
userrating = getUserRating(lx)
if isinstance(userrating, tuple) and len(userrating) == 2:
dic['用户评分'] = userrating[0]
dic['评分人数'] = userrating[1]
if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A):
dic['actor'].append('素人')
if not dic['series']:
dic['series'] = dic['studio']
if not dic['label']:
dic['label'] = dic['studio']
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = getActorPhoto(lx, javdb_site, session)
except Exception as e:
if debug:
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
# print(main('blacked.20.05.30'))
print(main('AGAV-042'))
print(main('BANK-022'))
print(main('070116-197'))
    print(main('093021_539')) # no sample stills; the studio is pacopacomama
#print(main('FC2-2278260'))
# print(main('FC2-735670'))
# print(main('FC2-1174949')) # not found
print(main('MVSD-439'))
# print(main('EHM0001')) # not found
#print(main('FC2-2314275'))
print(main('EBOD-646'))
print(main('LOVE-262'))
print(main('ABP-890'))
print(main('blacked.14.12.08'))

View File

@@ -1,161 +0,0 @@
import sys
sys.path.append('../')
import json
import bs4
import re
from WebCrawler import airav
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie
from ADC_function import get_javlib_cookie, get_html
def main(number: str):
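    # Search javlibrary by id: the site either redirects straight to a video page ("/?v=jav" in the
    # URL) or returns a result list, in which case the entry whose displayed id matches exactly is followed.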
raw_cookies, user_agent = get_javlib_cookie()
# Blank cookies mean javlib site return error
if not raw_cookies:
return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
# Manually construct a dictionary
s_cookie = SimpleCookie()
s_cookie.load(raw_cookies)
cookies = {}
for key, morsel in s_cookie.items():
cookies[key] = morsel.value
# Scraping
result = get_html(
"http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
cookies=cookies,
ua=user_agent,
return_type="object"
)
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>')
fanhao = fanhao_pather.findall(result.text)
if "/?v=jav" in result.url:
dic = {
"title": get_title(lx, soup),
"studio": get_table_el_single_anchor(soup, "video_maker"),
"year": get_table_el_td(soup, "video_date")[:4],
"outline": get_outline(number),
"director": get_table_el_single_anchor(soup, "video_director"),
"cover": get_cover(lx),
"imagecut": 1,
"actor_photo": "",
"website": result.url,
"source": "javlib.py",
"actor": get_table_el_multi_anchor(soup, "video_cast"),
"label": get_table_el_td(soup, "video_label"),
"tag": get_table_el_multi_anchor(soup, "video_genres"),
"number": get_table_el_td(soup, "video_id"),
"release": get_table_el_td(soup, "video_date"),
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
"series":'',
}
elif number.upper() in fanhao:
url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')
s = {}
url_list = url_pather.findall(result.text)
for url in url_list:
s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.')
av_url = s[number.upper()]
result = get_html(
av_url,
cookies=cookies,
ua=user_agent,
return_type="object"
)
soup = BeautifulSoup(result.text, "html.parser")
lx = html.fromstring(str(soup))
dic = {
"title": get_title(lx, soup),
"studio": get_table_el_single_anchor(soup, "video_maker"),
"year": get_table_el_td(soup, "video_date")[:4],
"outline": get_outline(number),
"director": get_table_el_single_anchor(soup, "video_director"),
"cover": get_cover(lx),
"imagecut": 1,
"actor_photo": "",
"website": result.url,
"source": "javlib.py",
"actor": get_table_el_multi_anchor(soup, "video_cast"),
"label": get_table_el_td(soup, "video_label"),
"tag": get_table_el_multi_anchor(soup, "video_genres"),
"number": get_table_el_td(soup, "video_id"),
"release": get_table_el_td(soup, "video_date"),
"runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
"series": '',
}
else:
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
return lx.xpath(xpath)[0].strip()
def get_outline(number):
try:
response = json.loads(airav.main(number))
result = response['outline']
return result
except:
return ''
def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
tag = soup.find(id=tag_id).find("a")
if tag is not None:
return tag.string.strip()
else:
return ""
def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
tags = soup.find(id=tag_id).find_all("a")
return process(tags)
def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
tags = soup.find(id=tag_id).find_all("td", class_="text")
return process(tags)
def process(tags: bs4.element.ResultSet) -> str:
values = []
for tag in tags:
value = tag.string
if value is not None and value != "----":
values.append(value)
return ",".join(x for x in values if x)
def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
number = get_table_el_td(soup, "video_id")
return title.replace(number, "").strip()
def get_cover(lx: html.HtmlElement) -> str:
return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
if __name__ == "__main__":
lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
#lists = ["DVMC-003"]
for num in lists:
print(main(num))

View File

@@ -1,173 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
import json
import re
from urllib.parse import urlparse, unquote
def getActorPhoto(html):
return ''
def getTitle(html):  # title
# <title>MD0140-2 / 家有性事EP2 爱在身边-麻豆社</title>
# <title>MAD039 机灵可爱小叫花 强诱僧人迫犯色戒-麻豆社</title>
# <title>MD0094贫嘴贱舌中出大嫂坏嫂嫂和小叔偷腥内射受孕-麻豆社</title>
# <title>TM0002-我的痴女女友-麻豆社</title>
browser_title = str(html.xpath("/html/head/title/text()")[0])
title = str(re.findall(r'^[A-Z0-9 /\-]*(.*)-麻豆社$', browser_title)[0]).strip()
return title
def getStudio(html):  # studio (revised)
try:
category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
return category.strip()
except:
return '麻豆社'
def getYear(html):  # year
    return ''
def getCover(htmlcode):  # cover image
try:
url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html):  # release date
    return ''
def getRuntime(html):  # runtime
    return ''
def getUrl(html):
    return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0])
def getNum(url, number):  # movie id / number
try:
        # decode the url
        filename = unquote(urlparse(url).path)
        # crop the file name (drop the leading '/' and trailing '.html')
        result = filename[1:-5].upper().strip()
        # strip any Chinese characters
        if result.upper() != number.upper():
            result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
        # strip stray separators
return result.strip('-')
except:
return ''
def getDirector(html):  # director (revised)
    return ''
def getOutline(html):  # outline
    return ''
def getSerise(html):  # series (revised)
    return ''
def getTag(html, studio):  # tags
x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i]
def getExtrafanart(html):  # sample images
    return ''
def cutTags(tags):
    # Unused helper (see the note in main below): would split actors out of the raw tag list.
    actors = []
    rest = []
    for tag in tags:
        actors.append(tag)
    return actors, rest
def main(number):
try:
try:
number = number.lower().strip()
url = "https://madou.club/" + number + ".html"
htmlcode = get_html(url)
except:
# print(number)
pass
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
studio = getStudio(html)
tags = getTag(html, studio)
        #actor,tags = cutTags(tags) # the actor's position within the tags is not fixed, so extracting it was abandoned
actor = ''
dic = {
            # title
            'title': getTitle(html),
            # studio
            'studio': studio,
            # year
            'year': getYear(html),
            # synopsis
            'outline': getOutline(html),
            # runtime
            'runtime': getRuntime(html),
            # director
            'director': getDirector(html),
            # actors
            'actor': actor,
            # release date
            'release': getRelease(html),
            # movie id / number
            'number': getNum(url, number),
            # cover image URL
            'cover': getCover(htmlcode),
            # sample images
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # tags
            'tag': tags,
            # label
            'label': getSerise(html),
            # source page URL
            'website': url,
            'source': 'madou.py',
            # series
'series': getSerise(html),
'无码': True
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
config.getInstance().set_override("debug_mode:switch=1")
print(main('MD0129'))
# print(main('TM0002'))
# print(main('MD0222'))
# print(main('MD0140-2'))
# print(main('MAD039'))
# print(main('JDMY027'))

View File

@@ -1,68 +0,0 @@
import sys
sys.path.append('../')
from bs4 import BeautifulSoup
from ADC_function import *
from WebCrawler.crawler import *
class MgsCrawler(Crawler):
def getMgsString(self, _xpath):
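        # mgstage sometimes wraps the value in an <a> tag and sometimes leaves it as plain <td> text,
        # so evaluate both the given xpath and its 'td/' variant and concatenate whichever matched.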
html = self.html
result1 = str(html.xpath(_xpath)).strip(" ['']").strip('\\n ').strip('\\n').strip(" ['']").replace(u'\\n', '').replace("', '', '", '')
result2 = str(html.xpath(_xpath.replace('td/a/','td/'))).strip(" ['']").strip('\\n ').strip('\\n')
return str(result1 + result2).strip('+').replace("', '",'').replace('"','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result2 = str(html.xpath('//th[contains(text(),"ジャンル:")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
result = str(result1 + result2).strip('+').replace("', '\\n",",").replace("', '","").replace('"','').replace(',,','').split(',')
return result
def getExtrafanart(htmlcode2):  # sample images
html_pather = re.compile(r'<dd>\s*?<ul>[\s\S]*?</ul>\s*?</dd>')
html = html_pather.search(htmlcode2)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a class=\"sample_image\" href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
return extrafanart_imgs
return ''
def main(number2):
number=number2.upper()
htmlcode2=str(get_html('https://www.mgstage.com/product/product_detail/'+str(number)+'/',cookies={'adc':'1'}))
soup = BeautifulSoup(htmlcode2, 'lxml')
a2 = str(soup.find(attrs={'class': 'detail_data'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
b2 = str(soup.find(attrs={'id': 'introduction'})).replace('\n ','').replace(' ','').replace('\n ','').replace('\n ','')
htmlcode = MgsCrawler(htmlcode2)
a = MgsCrawler(a2)
b = MgsCrawler(b2)
#print(b)
dic = {
'title': htmlcode.getString('//*[@id="center_column"]/div[1]/h1/text()').replace('/', ',').replace("\\n",'').replace(' ', '').strip(),
'studio': a.getMgsString('//th[contains(text(),"メーカー:")]/../td/a/text()'),
'outline': b.getString('//p/text()').strip(" ['']").replace(u'\\n', '').replace("', '', '", ''),
'runtime': a.getMgsString('//th[contains(text(),"収録時間:")]/../td/a/text()').rstrip('mi'),
'director': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
'actor': a.getMgsString('//th[contains(text(),"出演:")]/../td/a/text()'),
'release': a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()').replace('/','-'),
'number': a.getMgsString('//th[contains(text(),"品番:")]/../td/a/text()'),
'cover': htmlcode.getString('//*[@id="EnlargeImage"]/@href'),
'imagecut': 1,
'tag': getTag(a2),
'label': a.getMgsString('//th[contains(text(),"シリーズ:")]/../td/a/text()'),
'extrafanart': getExtrafanart(htmlcode2),
'year': str(re.findall('\d{4}',a.getMgsString('//th[contains(text(),"配信開始日:")]/../td/a/text()'))).strip(" ['']"),
# str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.mgstage.com/product/product_detail/' + str(number) + '/',
'source': 'mgstage.py',
'series': a.getMgsString('//th[contains(text(),"シリーズ")]/../td/a/text()'),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
print(main('SIRO-4149'))

View File

@@ -1,154 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
host = 'https://www.91mv.org'
def getActorPhoto(html):
return ''
def getTitle(html):  # title
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall('(.*)(91.*-\d*)',title)[0][0])
return result.strip()
except:
return ''
def getStudio(html):  # studio (revised)
return '91制片厂'
def getYear(html):  # year
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getCover(htmlcode):  # cover image
try:
url = str(re.findall('var pic_url = "(.*?)"',htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html):  # release date
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getRuntime(htmlcode):  # runtime
    return ''
def getActor(html):  # actresses
b=[]
for player in html.xpath('//p[@class="player-name"]/text()'):
player = player.replace('主演:','')
b.append(player)
return b
def getNum(html):  # movie id / number
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall('(.*)(91.*-\d*)',title)[0][1])
return result.strip()
except:
return ''
def getDirector(html):  # director (revised)
    return ''
def getOutline(html):  # outline
try:
result = str(html.xpath('//div[@class="play-text"]/text()')[0])
return result.strip()
except:
return ''
def getSerise(htmlcode):  # series (revised)
    return ''
def getTag(html):  # tags
    return html.xpath('//div[@class="player-tag"]/text()')
def getExtrafanart(htmlcode):  # sample images
    return ''
def search(keyword):  # search the site and return the first result's URL path
search_html = get_html(host + '/index/search?keywords=' + keyword)
html = etree.fromstring(search_html, etree.HTMLParser())
return html.xpath('//a[@class="video-list"]/@href')[0]
def main(number):
try:
try:
number = number.replace('91CM-','').replace('91MS-','')
url = host + str(search(number))
htmlcode = get_html(url)
except:
# print(number)
pass
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
            # title
            'title': getTitle(html),
            # studio
            'studio': getStudio(html),
            # year
            'year': getYear(html),
            # synopsis
            'outline': getOutline(html),
            # runtime
            'runtime': getRuntime(html),
            # director
            'director': getDirector(html),
            # actors
            'actor': getActor(html),
            # release date
            'release': getRelease(html),
            # movie id / number
            'number': getNum(html),
            # cover image URL
            'cover': getCover(htmlcode),
            # sample images
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # tags
            'tag': getTag(html),
            # label
            'label': getSerise(html),
            # source page URL
            'website': url,
            'source': 'mv91.py',
            # series
'series': getSerise(html)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
print(main('91CM-121'))
print(main('91CM-122'))
print(main('91CM-143'))
print(main('91MS-006'))

View File

@@ -1,412 +0,0 @@
import sys
sys.path.append('../')
import builtins
from ADC_function import *
from lxml.html import fromstring
from multiprocessing.dummy import Pool as ThreadPool
from difflib import SequenceMatcher
from unicodedata import category
from number_parser import is_uncensored
G_registered_storyline_site = {"airavwiki", "airav", "avno1", "xcity", "amazon", "58avgo"}
G_mode_txt = ('顺序执行','线程池')
class noThread(object):
def map(self, fn, param):
return list(builtins.map(fn, param))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
# Fetch the storyline: query the listed sites concurrently; earlier entries take priority when choosing a result.
def getStoryline(number, title, sites: list=None, 无码=None):
start_time = time.time()
conf = config.getInstance()
if not conf.is_storyline():
return ''
debug = conf.debug() or conf.storyline_show() == 2
storyine_sites = conf.storyline_site().split(',') if sites is None else sites
unc = 无码 if isinstance(无码, bool) else is_uncensored(number)
if unc:
storyine_sites += conf.storyline_uncensored_site().split(',')
else:
storyine_sites += conf.storyline_censored_site().split(',')
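    # De-duplicate the configured sites, keeping only those with a registered fetcher; anything
    # before a colon in an entry is treated as a sort prefix and stripped after sorting.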
r_dup = set()
sort_sites = []
for s in storyine_sites:
        ns = re.sub(r'.*?:', '', s, flags=re.A)
        if ns in G_registered_storyline_site and ns not in r_dup:
            sort_sites.append(s)
            r_dup.add(ns)
    sort_sites.sort()
    apply_sites = [re.sub(r'.*?:', '', s, flags=re.A) for s in sort_sites]
mp_args = ((site, number, title, debug) for site in apply_sites)
cores = min(len(apply_sites), os.cpu_count())
if cores == 0:
return ''
run_mode = 1 if conf.storyline_mode() > 0 else 0
with ThreadPool(cores) if run_mode > 0 else noThread() as pool:
results = pool.map(getStoryline_mp, mp_args)
sel = ''
if not debug and conf.storyline_show() == 0:
for value in results:
if isinstance(value, str) and len(value):
if not is_japanese(value):
return value
if not len(sel):
sel = value
return sel
    # The debug output below is also written to the log.
s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{len(apply_sites)}个任务共耗时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
sel_site = ''
for site, desc in zip(apply_sites, results):
if isinstance(desc, str) and len(desc):
if not is_japanese(desc):
sel_site, sel = site, desc
break
if not len(sel_site):
sel_site, sel = site, desc
for site, desc in zip(apply_sites, results):
sl = len(desc) if isinstance(desc, str) else 0
s += f'[选中{site}字数:{sl}]' if site == sel_site else f'{site}字数:{sl}' if sl else f'{site}:空'
# print(s)
return sel
def getStoryline_mp(args):
(site, number, title, debug) = args
start_time = time.time()
storyline = None
if not isinstance(site, str):
return storyline
elif site == "airavwiki":
storyline = getStoryline_airavwiki(number, debug)
#storyline = getStoryline_airavwiki_super(number, debug)
elif site == "airav":
storyline = getStoryline_airav(number, debug)
elif site == "avno1":
storyline = getStoryline_avno1(number, debug)
elif site == "xcity":
storyline = getStoryline_xcity(number, debug)
elif site == "amazon":
storyline = getStoryline_amazon(title, number, debug)
elif site == "58avgo":
storyline = getStoryline_58avgo(number, debug)
if not debug:
return storyline
print("[!]MP 线程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
site,
time.time() - start_time,
time.strftime("%H:%M:%S"),
storyline if isinstance(storyline, str) and len(storyline) else '[空]')
)
return storyline
def getStoryline_airav(number, debug):
try:
site = secrets.choice(('airav.cc','airav4.club'))
url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
res, session = get_html_session(url, return_type='session')
if not res:
raise ValueError(f"get_html_by_session('{url}') failed")
lx = fromstring(res.text)
urls = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/@href')
txts = lx.xpath('//div[@class="resultcontent"]/ul/li/div/a[@class="ga_click"]/h3[@class="one_name ga_name"]/text()')
detail_url = None
for txt, url in zip(txts, urls):
if re.search(number, txt, re.I):
detail_url = urljoin(res.url, url)
break
if detail_url is None:
raise ValueError("number not found")
res = session.get(detail_url)
if not res.ok:
raise ValueError(f"session.get('{detail_url}') failed")
lx = fromstring(res.text)
t = str(lx.xpath('/html/head/title/text()')[0]).strip()
airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0])
if not re.search(number, airav_number, re.I):
raise ValueError(f"page number ->[{airav_number}] not match")
desc = str(lx.xpath('//span[@id="ContentPlaceHolder1_Label2"]/text()')[0]).strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP getStoryline_airav Error: {e},number [{number}].")
pass
return None
def getStoryline_airavwiki(number, debug):
try:
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
url = f'https://cn.airav.wiki/?search={kwd}'
result, browser = get_html_by_browser(url, return_type='browser', use_scraper=True)
if not result.ok:
raise ValueError(f"get_html_by_browser('{url}','{number}') failed")
s = browser.page.select('div.row > div > div.videoList.row > div > a.d-block')
link = None
for a in s:
title = a.img['title']
list_number = re.findall('^(.*?)\s+', title, re.A)[0].strip()
            if kwd == number: # ids like PRED-164 and RED-164 must be told apart
if re.match(f'^{number}$', list_number, re.I):
link = a
break
elif re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or not re.search(number, browser.url, re.I):
raise ValueError("detail page not found")
title = browser.page.select('head > title')[0].text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError(f"detail page number not match, got ->[{detail_number}]")
desc = browser.page.select_one('div.d-flex.videoDataBlock > div.synopsis > p').text.strip()
return desc
except Exception as e:
if debug:
print(f"[-]MP def getStoryline_airavwiki Error: {e}, number [{number}].")
pass
return ''
def getStoryline_58avgo(number, debug):
try:
url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
'', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
'?status=1&Sort=Playon', '?status=1&Sort=dateupload', 'status=1&Sort=dateproduce'
        ]) # pick one at random so a single ip's requests do not look too uniform in the site's httpd log
kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
result, browser = get_html_by_form(url,
fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
return_type = 'browser')
if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
if f'searchresults.aspx?Search={kwd}' not in browser.url:
raise ValueError("number not found")
s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
link = None
for a in s:
title = a.h3.text.strip()
list_number = title[title.rfind(' ')+1:].strip()
if re.search(number, list_number, re.I):
link = a
break
if link is None:
raise ValueError("number not found")
result = browser.follow_link(link)
if not result.ok or 'playon.aspx' not in browser.url:
raise ValueError("detail page not found")
title = browser.page.select_one('head > title').text.strip()
detail_number = str(re.findall('\[(.*?)]', title)[0])
if not re.search(number, detail_number, re.I):
raise ValueError(f"detail page number not match, got ->[{detail_number}]")
return browser.page.select_one('#ContentPlaceHolder1_Label2').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
try:
site = secrets.choice(['1768av.club','2nine.net','av999.tv','avno1.cc',
'hotav.biz','iqq2.xyz','javhq.tv',
'www.hdsex.cc','www.porn18.cc','www.xxx18.cc',])
url = f'http://{site}/cn/search.php?kw_type=key&kw={number}'
lx = fromstring(get_html_by_scraper(url))
descs = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/@data-description')
titles = lx.xpath('//div[@class="type_movie"]/div/ul/li/div/a/h3/text()')
if not descs or not len(descs):
raise ValueError(f"number not found")
partial_num = bool(re.match(r'\d{6}[\-_]\d{2,3}', number))
for title, desc in zip(titles, descs):
page_number = title[title.rfind(' ')+1:].strip()
if not partial_num:
if re.match(f'^{number}$', page_number, re.I):
return desc.strip()
elif re.search(number, page_number, re.I):
return desc.strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_avno1OLD(number, debug):  # fetch the storyline from avno1.cc (old form-based variant)
try:
url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
'?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
        ]) # pick one at random so a single ip's requests do not look too uniform in the site's httpd log
result, browser = get_html_by_form(url,
form_select='div.wrapper > div.header > div.search > form',
fields = {'kw' : number},
return_type = 'browser')
if not result:
raise ValueError(f"get_html_by_form('{url}','{number}') failed")
s = browser.page.select('div.type_movie > div > ul > li > div')
for div in s:
title = div.a.h3.text.strip()
page_number = title[title.rfind(' ')+1:].strip()
if re.search(number, page_number, re.I):
return div['data-description'].strip()
raise ValueError(f"page number ->[{page_number}] not match")
except Exception as e:
if debug:
print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
pass
return ''
def getStoryline_xcity(number, debug):  # fetch the storyline from xcity
try:
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("detail page not found")
return browser.page.select_one('h2.title-detail + p.lead').text.strip()
except Exception as e:
if debug:
print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
pass
return ''
def getStoryline_amazon(q_title, number, debug):
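    # Search amazon.co.jp for the release title, click through the adult-content "black curtain"
    # interstitial when present, pick the best-matching DVD/Blu-ray listing via amazon_select_one()
    # and return its product description.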
if not isinstance(q_title, str) or not len(q_title):
return None
try:
cookie, cookies_filepath = load_cookies('amazon.json')
url = "https://www.amazon.co.jp/s?k=" + q_title
res, session = get_html_session(url, cookies=cookie, return_type='session')
if not res:
raise ValueError("get_html_session() failed")
lx = fromstring(res.text)
lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
if len(lks) and lks[0].startswith('/'):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
titles = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/text()")
urls = lx.xpath("//span[contains(@class,'a-size-base-plus a-color-base a-text-normal')]/../@href")
if not len(urls) or len(urls) != len(titles):
raise ValueError("titles not found")
idx = amazon_select_one(titles, q_title, number, debug)
if not isinstance(idx, int) or idx < 0:
raise ValueError("title and number not found")
furl = urljoin(res.url, urls[idx])
res = session.get(furl)
if not res.ok:
raise ValueError("browser.open_relative()) failed.")
lx = fromstring(res.text)
lks = lx.xpath('//a[contains(@href, "/black-curtain/save-eligibility/black-curtain")]/@href')
if len(lks) and lks[0].startswith('/'):
res = session.get(urljoin(res.url, lks[0]))
cookie = None
lx = fromstring(res.text)
p1 = lx.xpath('//*[@id="productDescription"]/p[1]/span/text()')
p2 = lx.xpath('//*[@id="productDescription"]/p[2]/span/text()')
ama_t = ' '.join(p1) + ' '.join(p2)
ama_t = re.sub(r'審査番号:\d+', '', ama_t).strip()
if cookie is None:
            # Delete the invalid cookies file, whether user-created or auto-created, to avoid repeated failures.
            cookies_filepath and len(cookies_filepath) and Path(cookies_filepath).is_file() and Path(cookies_filepath).unlink(missing_ok=True)
            # Auto-created cookie files sit at the end of the search path list (lowest priority); users with an
            # amazon.co.jp account can export cookies from their browser into an earlier search path instead.
ama_save = Path.home() / ".local/share/mdc/amazon.json"
ama_save.parent.mkdir(parents=True, exist_ok=True)
ama_save.write_text(json.dumps(session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
return ama_t
except Exception as e:
if debug:
print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
pass
return None
# Among the DVD and Blu-ray listings on the shelf, pick the one whose title is most similar.
def amazon_select_one(a_titles, q_title, number, debug):
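    # Rough matcher: punctuation, symbols and separators are stripped via unicodedata.category, each
    # candidate title is scanned right-to-left for runs of characters that also appear in the query
    # title, and the highest SequenceMatcher ratio wins (only ratios >= 0.9 are trusted outside debug mode).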
sel = -1
ratio = 0
que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
for tloc, title in enumerate(a_titles):
        if re.search(number, title, re.I): # titles rarely include the id, but a few do; an exact id match passes immediately
return tloc
if not re.search('DVD|Blu-ray', title, re.I):
continue
        ama_t = str(re.sub('DVD|Blu-ray', "", title, flags=re.I))
ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
findlen = 0
lastpos = -1
for cloc, char in reversed(tuple(enumerate(ama_t))):
pos = que_t.rfind(char)
if lastpos >= 0:
pos_near = que_t[:lastpos].rfind(char)
if pos_near < 0:
findlen = 0
lastpos = -1
ama_t = ama_t[:cloc+1]
else:
pos = pos_near
if pos < 0:
if category(char) == 'Nd':
return -1
if re.match(r'[\u4e00|\u4e8c|\u4e09|\u56db|\u4e94|\u516d|\u4e03|\u516b|\u4e5d|\u5341]', char, re.U):
return -1
ama_t = ama_t[:cloc]
findlen = 0
lastpos = -1
continue
if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
findlen += 1
lastpos = pos
if findlen >= 4:
break
continue
findlen = 1
lastpos = pos
if findlen==0:
return -1
r = SequenceMatcher(None, ama_t, que_t).ratio()
if r > ratio:
sel = tloc
ratio = r
save_t_ = ama_t
if ratio > 0.999:
break
if ratio < 0.5:
return -1
if not debug:
        # For now only results with a similarity of at least 0.9 are trusted.
return sel if ratio >= 0.9 else -1
    # In debug mode, log how accurate the matching was.
    if ratio < 0.9:
        # Rejected results with similarity in [0.5, 0.9) go to a separate log.
with (Path.home() / '.mlogs/ratio0.5.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return -1
    # Log of the accepted results.
with (Path.home() / '.mlogs/ratio.txt').open('a', encoding='utf-8') as hrt:
hrt.write(f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
return sel

View File

@@ -1,220 +0,0 @@
import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.storyline import getStoryline
def getTitle(html):
result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
return result
def getActor(browser):
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = []
for i in htmla:
t.append(i.text.strip())
return t
def getActorPhoto(browser):
htmla = browser.page.select('#avodDetails > div > div.frame > div.content > div > ul.profileCL > li.credit-links > a')
t = {i.text.strip(): i['href'] for i in htmla}
o = {}
for k, v in t.items():
r = browser.open_relative(v)
if not r.ok:
continue
pic = browser.page.select_one('#avidolDetails > div > div.frame > div > p > img')
if 'noimage.gif' in pic['src']:
continue
o[k] = urljoin(browser.url, pic['src'])
return o
def getStudio(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
except:
result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
return result.strip('+').replace("', '", '').replace('"', '')
def getRuntime(html):
try:
x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
return x
except:
return ''
def getLabel(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
return result
except:
return ''
def getNum(html):
try:
result = html.xpath('//*[@id="hinban"]/text()')[0]
return result
except:
return ''
def getYear(getRelease):
try:
result = str(re.search('\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except:
return ''
try:
return re.findall('\d{4}/\d{2}/\d{2}', result)[0].replace('/','-')
except:
return ''
def getTag(html):
result = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
total = []
for i in result:
total.append(i.replace("\n","").replace("\t",""))
return total
def getCover_small(html, index=0):
    # same issue mentioned below,
    # javdb sometimes returns multiple results
    # DO NOT just get the first one, get the one with the correct index number
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
return 'https:' + result
except:
return ''
def getDirector(html):
try:
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
return result
except:
return ''
def getOutline(html, number, title):
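    # Prefer a Chinese synopsis from the configured storyline sites (airav/avno1); otherwise fall back
    # to the Japanese lead paragraph on the xcity detail page itself, with the id stripped out.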
storyline_site = config.getInstance().storyline_site().split(',')
    a = set(storyline_site) & {'airav', 'avno1'} # only want the Chinese synopsis text
if len(a):
site = [n for n in storyline_site if n in a]
g = getStoryline(number, title, site, 无码=False)
if len(g):
return g
try:
x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
return x.replace(getNum(html), '')
except:
return ''
def getSeries(html):
try:
try:
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
return result
except:
result = html.xpath("//span[contains(text(),'シリーズ')]/../span/text()")[0]
return result
except:
return ''
def getExtrafanart(htmlcode):  # sample images
html_pather = re.compile(r'<div id="sample_images".*?>[\s\S]*?</div>')
html = html_pather.search(htmlcode)
if html:
html = html.group()
extrafanart_pather = re.compile(r'<a.*?href=\"(.*?)\"')
extrafanart_imgs = extrafanart_pather.findall(html)
if extrafanart_imgs:
s = []
for urli in extrafanart_imgs:
urli = 'https:' + urli.replace('/scene/small', '')
s.append(urli)
return s
return ''
def open_by_browser(number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
raise ValueError("xcity.py: page not found")
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
return str(browser.page), browser
def main(number):
try:
detail_page, browser = open_by_browser(number)
url = browser.url
lx = etree.fromstring(detail_page, etree.HTMLParser())
newnum = getNum(lx).upper()
number_up = number.upper()
if newnum != number_up:
if newnum == number.replace('-','').upper():
newnum = number_up
else:
raise ValueError("xcity.py: number not found")
title = getTitle(lx)
dic = {
'actor': getActor(browser),
'title': title,
'studio': getStudio(lx),
'outline': getOutline(lx, number, title),
'runtime': getRuntime(lx),
'director': getDirector(lx),
'release': getRelease(lx),
'number': newnum,
'cover': getCover(lx),
'cover_small': '',
'extrafanart': getExtrafanart(detail_page),
'imagecut': 1,
'tag': getTag(lx),
'label': getLabel(lx),
'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
'website': url,
'source': 'xcity.py',
'series': getSeries(lx),
}
if config.getInstance().download_actor_photo_for_kodi():
dic['actor_photo'] = getActorPhoto(browser)
except Exception as e:
if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
if __name__ == '__main__':
config.getInstance().set_override("storyline:switch=0")
config.getInstance().set_override("actor_photo:download_for_kodi=1")
config.getInstance().set_override("debug_mode:switch=1")
print(main('RCTD-288'))
print(main('VNDS-2624'))
print(main('ABP-345'))