Merge branch 'yoshiko2_master'
# Conflicts: # WebCrawler/javdb.py
This commit is contained in:
@@ -22,6 +22,8 @@ from . import xcity
|
||||
from . import dlsite
|
||||
from . import carib
|
||||
from . import fc2club
|
||||
from . import mv91
|
||||
from . import madou
|
||||
|
||||
|
||||
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
|
||||
@@ -36,9 +38,10 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测
|
||||
|
||||
return True
|
||||
|
||||
def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
|
||||
def get_data_from_json(file_number, oCC):
|
||||
"""
|
||||
iterate through all services and fetch the data
|
||||
iterate through all services and fetch the data 从JSON返回元数据
|
||||
"""
|
||||
|
||||
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
|
||||
@@ -57,13 +60,15 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
# "javlib": javlib.main,
|
||||
"dlsite": dlsite.main,
|
||||
"carib": carib.main,
|
||||
"fc2club": fc2club.main
|
||||
"fc2club": fc2club.main,
|
||||
"mv91": mv91.main,
|
||||
"madou": madou.main
|
||||
}
|
||||
|
||||
conf = config.getInstance()
|
||||
# default fetch order list, from the beginning to the end
|
||||
sources = conf.sources().split(',')
|
||||
if not len(conf.sources()) > 80:
|
||||
if len(sources) <= len(func_mapping):
|
||||
# if the input file name matches certain rules,
|
||||
# move some web service to the beginning of the list
|
||||
lo_file_number = file_number.lower()
|
||||
@@ -231,8 +236,8 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
json_data['studio'] = studio
|
||||
json_data['director'] = director
|
||||
|
||||
if conf.is_transalte():
|
||||
translate_values = conf.transalte_values().split(",")
|
||||
if conf.is_translate():
|
||||
translate_values = conf.translate_values().split(",")
|
||||
for translate_value in translate_values:
|
||||
if json_data[translate_value] == "":
|
||||
continue
|
||||
@@ -244,12 +249,12 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
if conf.get_transalte_engine() == "azure":
|
||||
if conf.get_translate_engine() == "azure":
|
||||
t = translate(
|
||||
json_data[translate_value],
|
||||
target_language="zh-Hans",
|
||||
engine=conf.get_transalte_engine(),
|
||||
key=conf.get_transalte_key(),
|
||||
engine=conf.get_translate_engine(),
|
||||
key=conf.get_translate_key(),
|
||||
)
|
||||
else:
|
||||
t = translate(json_data[translate_value])
|
||||
@@ -270,7 +275,7 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
|
||||
return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
|
||||
else:
|
||||
return vars
|
||||
raise IndexError('keyword not found')
|
||||
for cc in cc_vars:
|
||||
if json_data[cc] == "" or len(json_data[cc]) == 0:
|
||||
continue
|
||||
@@ -298,20 +303,20 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
|
||||
elif ccm == 3:
|
||||
json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
|
||||
except:
|
||||
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
|
||||
else:
|
||||
try:
|
||||
if ccm == 1:
|
||||
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
|
||||
elif ccm == 2:
|
||||
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
|
||||
elif ccm == 3:
|
||||
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
|
||||
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
|
||||
except IndexError:
|
||||
json_data[cc] = oCC.convert(json_data[cc])
|
||||
except:
|
||||
@@ -322,11 +327,13 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
|
||||
if i not in json_data:
|
||||
naming_rule += i.strip("'").strip('"')
|
||||
else:
|
||||
naming_rule += json_data.get(i)
|
||||
item = json_data.get(i)
|
||||
naming_rule += item if type(item) is not list else "&".join(item)
|
||||
|
||||
json_data['naming_rule'] = naming_rule
|
||||
return json_data
|
||||
|
||||
|
||||
def special_characters_replacement(text) -> str:
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
|
||||
@@ -139,6 +139,7 @@ def getCover_small(html, index=0):
|
||||
def getTrailer(htmlcode): # 获取预告片
|
||||
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
|
||||
video = video_pather.findall(htmlcode)
|
||||
# 加上数组判空
|
||||
if video and video[0] != "":
|
||||
if not 'https:' in video[0]:
|
||||
video_url = 'https:' + video[0]
|
||||
@@ -263,16 +264,14 @@ def main(number):
|
||||
# replace wit normal cover and cut it
|
||||
imagecut = 1
|
||||
cover_small = getCover(lx)
|
||||
|
||||
dp_number = getNum(lx)
|
||||
if dp_number.upper() != number:
|
||||
raise ValueError("number not found")
|
||||
if dp_number.upper() != number.upper():
|
||||
raise ValueError("number not eq"+dp_number)
|
||||
title = getTitle(lx)
|
||||
if title and dp_number:
|
||||
number = dp_number
|
||||
# remove duplicate title
|
||||
title = title.replace(number, '').strip()
|
||||
|
||||
dic = {
|
||||
'actor': getActor(lx),
|
||||
'title': title,
|
||||
@@ -325,7 +324,7 @@ if __name__ == "__main__":
|
||||
# print(main('FC2-1174949')) # not found
|
||||
#print(main('MVSD-439'))
|
||||
# print(main('EHM0001')) # not found
|
||||
print(main('032517_505'))
|
||||
print(main('FC2-2314275'))
|
||||
# print(main('EBOD-646'))
|
||||
# print(main('LOVE-262'))
|
||||
#print(main('ABP-890'))
|
||||
|
||||
164
WebCrawler/madou.py
Normal file
164
WebCrawler/madou.py
Normal file
@@ -0,0 +1,164 @@
|
||||
from bs4 import BeautifulSoup # need install
|
||||
from lxml import etree # need install
|
||||
from pyquery import PyQuery as pq # need install
|
||||
from ADC_function import *
|
||||
import json
|
||||
import re
|
||||
from lib2to3.pgen2 import parse
|
||||
import sys
|
||||
|
||||
from urllib.parse import urlparse, unquote
|
||||
sys.path.append('../')
|
||||
|
||||
|
||||
def getActorPhoto(html):
    """Return the actor-photo value for a madou page.

    Nothing is read from ``html``; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getTitle(html, number):  # Get the title
    """Extract the title from the first ``<h1 class="article-title">`` text.

    The heading is split on the characters ``/``, ``|`` and ``-``. When at
    least one separator is present, the second piece is returned stripped.
    Otherwise the upper-cased ``number`` is removed from the heading and the
    stripped remainder is returned.

    Args:
        html: parsed document exposing an ``xpath`` method.
        number: the identifier that was searched for.

    Raises:
        IndexError: if the page has no matching heading.
    """
    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
    # An explicit length check replaces the former bare ``except:`` that was
    # only there to catch the IndexError of a heading without separators.
    pieces = re.split(r'[/|/|-]', title)
    if len(pieces) > 1:
        return pieces[1].strip()
    return title.replace(number.upper(), '').strip()
|
||||
|
||||
|
||||
def getStudio(html):  # Get the studio (modified)
    """Return the text of the first ``<a rel="category tag">`` link, stripped.

    Falls back to the literal '麻豆社' when the link cannot be read (for
    example when the XPath query yields no result).
    """
    try:
        category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
    except Exception:
        # Best-effort scraper: any lookup failure yields the default studio,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return '麻豆社'
    return category.strip()
|
||||
|
||||
|
||||
def getYear(html):  # Get the year
    """Return the year for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getCover(htmlcode):  # Get the cover image
    """Return the cover URL embedded in the raw page source.

    The URL is the first single-quoted value following ``shareimage : `` in
    ``htmlcode``, stripped of surrounding whitespace. An empty string is
    returned when no such value exists or ``htmlcode`` cannot be searched.
    """
    try:
        url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
    except Exception:
        # Missing match (IndexError) or non-text input (TypeError): no cover.
        return ''
    return url.strip()
|
||||
|
||||
|
||||
def getRelease(html):  # Get the release date
    """Return the release date for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getRuntime(html):  # Get the running time
    """Return the running time for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def getUrl(html):
    """Return the ``data-url`` attribute of the first ``<a class="share-weixin">``.

    Raises:
        IndexError: if the document has no such link.
    """
    share_urls = html.xpath('//a[@class="share-weixin"]/@data-url')
    first_url = share_urls[0]
    return str(first_url)
|
||||
|
||||
|
||||
def getNum(url, number):  # Get the number (identifier)
    """Derive the identifier from the path component of ``url``.

    The path is percent-decoded, its first character and last five
    characters are dropped (presumably the leading ``/`` and a trailing
    ``.html`` -- confirm against the site's URLs), and the rest is
    upper-cased. If that differs from ``number`` (case-insensitively),
    everything from the first run of non-ASCII characters onwards is cut
    off. Leading and trailing ``-`` are removed from the result.

    Returns an empty string when anything goes wrong (for example when
    ``number`` is not a string).
    """
    try:
        # Decode the URL path.
        filename = unquote(urlparse(url).path)
        # Trim the file name.
        result = filename[1:-5].upper().strip()
        # Drop trailing non-ASCII text (e.g. a Chinese title suffix).
        if result.upper() != number.upper():
            # maxsplit is passed by keyword; the positional form is deprecated.
            result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]
        # Remove surplus dashes.
        return result.strip('-')
    except Exception:
        return ''
|
||||
|
||||
|
||||
def getDirector(html):  # Get the director (modified)
    """Return the director for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getOutline(html):  # Get the outline
    """Return the outline for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getSerise(html):  # Get the series (modified)
    """Return the series for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getTag(html):  # Get the tags
    """Return the text of every link inside ``<div class="article-tags">``.

    The XPath result is handed back unchanged.
    """
    tag_texts = html.xpath('//div[@class="article-tags"]/a/text()')
    return tag_texts
|
||||
|
||||
|
||||
def getExtrafanart(html):  # Get the stills
    """Return the extra fan art for a madou page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def cutTags(tags):
    """Split the scraped tag texts into ``(actors, tags)``.

    Every incoming tag is currently classified as an actor, so the second
    element of the result is always an empty list.

    The previous implementation rebound the ``tags`` parameter to ``[]``
    before iterating over it, which silently discarded the input and always
    produced ``([], [])``.

    Args:
        tags: iterable of tag strings taken from the page.

    Returns:
        A tuple ``(actors, remaining_tags)`` of two new lists.
    """
    # NOTE(review): no rule for telling actors from ordinary tags is visible
    # here; all tags go to ``actors`` as the original loop intended -- confirm.
    actors = list(tags)
    remaining_tags = []
    return actors, remaining_tags
|
||||
|
||||
|
||||
def main(number):
    """Scrape madou.club for ``number`` and return the metadata as JSON text.

    The page ``https://madou.club/<number>.html`` (number lower-cased and
    stripped) is fetched with ``get_html``, parsed with lxml, and the
    individual fields are collected through the ``get*`` helpers of this
    module.

    Returns:
        A JSON string. On success it holds the full metadata dictionary; on
        any failure it holds only ``{"title": ""}``.
    """
    try:
        try:
            number = number.lower().strip()
            url = "https://madou.club/" + number + ".html"
            htmlcode = get_html(url)
        except Exception:
            # Still report which number failed, then pass the real error on
            # to the outer handler. Previously execution fell through and the
            # cause was masked by an UnboundLocalError on ``htmlcode``.
            print(number)
            raise

        html = etree.fromstring(htmlcode, etree.HTMLParser())
        url = getUrl(html)
        tags = getTag(html)
        actor, tags = cutTags(tags)
        dic = {
            # Title
            'title': getTitle(html, number),
            # Studio
            'studio': getStudio(html),
            # Year
            'year': getYear(html),
            # Outline
            'outline': getOutline(html),
            # Running time
            'runtime': getRuntime(html),
            # Director
            'director': getDirector(html),
            # Actors
            'actor': actor,
            # Release date
            'release': getRelease(html),
            # Number (identifier)
            'number': getNum(url, number),
            # Cover link
            'cover': getCover(htmlcode),
            # Stills
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # Tags
            'tag': tags,
            # Label (same source as the series)
            'label': getSerise(html),
            # Page URL taken from the share link
            'website': url,
            'source': 'madou.py',
            # Series
            'series': getSerise(html)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
                        indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
        # NOTE(review): ``config`` is not imported by name in this module; it
        # is presumably provided by the ADC_function star-import -- confirm.
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: scrape one sample number and show the JSON result.
    sample_result = main('MD0094')
    print(sample_result)
|
||||
158
WebCrawler/mv91.py
Normal file
158
WebCrawler/mv91.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import sys
|
||||
sys.path.append('../')
|
||||
import re
|
||||
from pyquery import PyQuery as pq#need install
|
||||
from lxml import etree#need install
|
||||
from bs4 import BeautifulSoup#need install
|
||||
import json
|
||||
from ADC_function import *
|
||||
|
||||
|
||||
# Base URL of the 91mv site; search() and main() prepend it to site-relative paths.
host = 'https://www.91mv.org'
|
||||
|
||||
def getActorPhoto(html):
    """Return the actor-photo value for a 91mv page.

    Nothing is read from ``html``; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def getTitle(html):  # Get the title
    """Return the part of the player title that precedes the ``91...-NNN`` number.

    The first text node of ``<div class="player-title">`` is matched against
    ``(.*)(91.*-\\d*)`` and the first group is returned stripped. An empty
    string is returned when the node is missing or the pattern does not
    match.
    """
    try:
        title = str(html.xpath('//div[@class="player-title"]/text()')[0])
        # Raw string: ``\d`` in a normal literal is an invalid escape sequence.
        result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][0])
    except Exception:
        return ''
    return result.strip()
|
||||
|
||||
def getStudio(html):  # Get the studio (modified)
    """Return the studio name, which is the fixed string '91制片厂'.

    ``html`` is not inspected.
    """
    return '91制片厂'
|
||||
|
||||
def getYear(html):  # Get the year
    """Return the page's date text with the '日期:' label removed.

    The value comes from the first text node of ``<p class="date">``. The
    whole remaining string is returned, not just a year component. An empty
    string is returned when the node is missing or cannot be read.
    """
    try:
        raw = str(html.xpath('//p[@class="date"]/text()')[0])
    except Exception:
        return ''
    # ``raw`` is always a str here, so the former isinstance/len guard was
    # redundant: an empty remainder is already the empty-string fallback.
    return raw.replace('日期:', '')
|
||||
|
||||
def getCover(htmlcode):  # Get the cover image
    """Return the cover URL assigned to ``pic_url`` in the raw page source.

    The URL is the first double-quoted value following ``var pic_url = `` in
    ``htmlcode``, stripped of surrounding whitespace. An empty string is
    returned when no such value exists or ``htmlcode`` cannot be searched.
    """
    try:
        url = str(re.findall('var pic_url = "(.*?)"', htmlcode)[0])
    except Exception:
        # Missing match (IndexError) or non-text input (TypeError): no cover.
        return ''
    return url.strip()
|
||||
|
||||
def getRelease(html):  # Get the release date
    """Return the page's date text with the '日期:' label removed.

    The value comes from the first text node of ``<p class="date">``. An
    empty string is returned when the node is missing or cannot be read.
    """
    try:
        raw = str(html.xpath('//p[@class="date"]/text()')[0])
    except Exception:
        return ''
    # ``raw`` is always a str here, so the former isinstance/len guard was
    # redundant: an empty remainder is already the empty-string fallback.
    return raw.replace('日期:', '')
|
||||
|
||||
def getRuntime(htmlcode):  # Get the running time
    """Return the running time for a 91mv page.

    ``htmlcode`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def getActor(html):  # Get the actresses
    """Return the performer names listed on the page.

    Each text node of ``<p class="player-name">`` has the '主演:' label
    removed; the results are returned as a list in document order.
    """
    name_nodes = html.xpath('//p[@class="player-name"]/text()')
    return [node.replace('主演:', '') for node in name_nodes]
|
||||
|
||||
def getNum(html):  # Get the number (identifier)
    """Return the ``91...-NNN`` number found in the player title.

    The first text node of ``<div class="player-title">`` is matched against
    ``(.*)(91.*-\\d*)`` and the second group is returned stripped. An empty
    string is returned when the node is missing or the pattern does not
    match.
    """
    try:
        title = str(html.xpath('//div[@class="player-title"]/text()')[0])
        # Raw string: ``\d`` in a normal literal is an invalid escape sequence.
        result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][1])
    except Exception:
        return ''
    return result.strip()
|
||||
|
||||
def getDirector(html):  # Get the director (modified)
    """Return the director for a 91mv page.

    ``html`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def getOutline(html):  # Get the outline
    """Return the first text node of ``<div class="play-text">``, stripped.

    An empty string is returned when the node is missing or cannot be read.
    """
    try:
        result = str(html.xpath('//div[@class="play-text"]/text()')[0])
    except Exception:
        # Best-effort scraper: any lookup failure means "no outline", but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''
    return result.strip()
|
||||
|
||||
|
||||
def getSerise(htmlcode):  # Get the series (modified)
    """Return the series for a 91mv page.

    ``htmlcode`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def getTag(html):  # Get the tags
    """Return the text nodes found directly inside ``<div class="player-tag">``.

    The XPath result is handed back unchanged.
    """
    tag_texts = html.xpath('//div[@class="player-tag"]/text()')
    return tag_texts
|
||||
|
||||
def getExtrafanart(htmlcode):  # Get the stills
    """Return the extra fan art for a 91mv page.

    ``htmlcode`` is not inspected; the result is always an empty string.
    """
    return ''
|
||||
|
||||
def search(keyword):  # Search and return the first result
    """Search the site for ``keyword`` and return the first hit's link.

    The search page ``<host>/index/search?keywords=<keyword>`` is fetched
    with ``get_html`` and the ``href`` of the first ``<a class="video-list">``
    is returned.

    Args:
        keyword: search text; it is percent-encoded before being placed in
            the query string so that characters such as ``&``, ``#`` or
            spaces cannot corrupt the URL.

    Raises:
        IndexError: if the result page contains no matching link.
    """
    from urllib.parse import quote

    search_html = get_html(host + '/index/search?keywords=' + quote(keyword, safe=''))
    html = etree.fromstring(search_html, etree.HTMLParser())
    return html.xpath('//a[@class="video-list"]/@href')[0]
|
||||
|
||||
def main(number):
    """Scrape the 91mv site for ``number`` and return the metadata as JSON text.

    The prefixes ``91CM-`` and ``91MS-`` are removed from ``number``, the
    remainder is passed to ``search``, and the first hit is fetched with
    ``get_html`` and parsed with lxml. The individual fields are collected
    through the ``get*`` helpers of this module.

    Returns:
        A JSON string. On success it holds the full metadata dictionary; on
        any failure it holds only ``{"title": ""}``.
    """
    try:
        try:
            number = number.replace('91CM-', '').replace('91MS-', '')
            url = host + str(search(number))
            htmlcode = get_html(url)
        except Exception:
            # Still report which number failed, then pass the real error on
            # to the outer handler. Previously execution fell through and the
            # cause was masked by an UnboundLocalError on ``htmlcode``.
            print(number)
            raise

        html = etree.fromstring(htmlcode, etree.HTMLParser())
        dic = {
            # Title
            'title': getTitle(html),
            # Studio
            'studio': getStudio(html),
            # Year
            'year': getYear(html),
            # Outline
            'outline': getOutline(html),
            # Running time
            'runtime': getRuntime(html),
            # Director
            'director': getDirector(html),
            # Actors
            'actor': getActor(html),
            # Release date
            'release': getRelease(html),
            # Number (identifier)
            'number': getNum(html),
            # Cover link
            'cover': getCover(htmlcode),
            # Stills
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # Tags
            'tag': getTag(html),
            # Label (same source as the series)
            'label': getSerise(html),
            # URL of the scraped page
            'website': url,
            'source': 'mv91.py',
            # Series
            'series': getSerise(html)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
        # NOTE(review): ``config`` is not imported by name in this module; it
        # is presumably provided by the ADC_function star-import -- confirm.
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: scrape a few sample numbers and show each JSON result.
    for sample_number in ('91CM-121', '91CM-122', '91CM-143', '91MS-006'):
        print(main(sample_number))
|
||||
Reference in New Issue
Block a user