支持91制片室和麻豆,优化图片裁剪功能添加了人脸识别模块

This commit is contained in:
hejianjun
2022-01-30 03:37:08 +08:00
parent 9a9d36672f
commit a84452ba1c
8 changed files with 449 additions and 45 deletions

View File

@@ -22,6 +22,8 @@ from . import xcity
from . import dlsite
from . import carib
from . import fc2club
from . import mv91
from . import madou
def get_data_state(data: dict) -> bool: # 元数据获取失败检测
@@ -57,7 +59,9 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main
}
conf = config.getInstance()

View File

@@ -1,3 +1,4 @@
import logging
import sys
sys.path.append('../')
import re
@@ -139,7 +140,8 @@ def getCover_small(html, index=0):
def getTrailer(htmlcode): # 获取预告片
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
video = video_pather.findall(htmlcode)
if video[0] != "":
# 加上数组判空
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
@@ -263,16 +265,14 @@ def main(number):
# replace wit normal cover and cut it
imagecut = 1
cover_small = getCover(lx)
dp_number = getNum(lx)
if dp_number.upper() != number:
raise ValueError("number not found")
if dp_number.upper() != number.upper():
raise ValueError("number not eq"+dp_number)
title = getTitle(lx)
if title and dp_number:
number = dp_number
# remove duplicate title
title = title.replace(number, '').strip()
dic = {
'actor': getActor(lx),
'title': title,

164
WebCrawler/madou.py Normal file
View File

@@ -0,0 +1,164 @@
from bs4 import BeautifulSoup # need install
from lxml import etree # need install
from pyquery import PyQuery as pq # need install
from ADC_function import *
import json
import re
from lib2to3.pgen2 import parse
import sys
from urllib.parse import urlparse, unquote
sys.path.append('../')
def getActorPhoto(html):
    """Return the actor-photo field; this scraper always yields an empty string."""
    return ""
def getTitle(html, number):
    """Return the title taken from the page's ``h1.article-title`` text.

    When the heading contains ``/``, ``|`` or ``-``, the second piece of the
    split (stripped) is returned. Otherwise the heading is returned with the
    upper-cased *number* removed.

    Raises:
        IndexError: if the page has no ``h1.article-title`` text node.
    """
    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
    # ``[/|-]`` matches the same characters as the former ``[/||-]`` but does
    # not trigger the "possible set union" FutureWarning caused by ``||``.
    pieces = re.split(r'[/|-]', title)
    if len(pieces) > 1:
        return pieces[1].strip()
    return title.replace(number.upper(), '').strip()
def getStudio(html):
    """Return the studio name from the first ``a[rel="category tag"]`` text.

    Falls back to the literal ``'麻豆社'`` when the category cannot be read.
    """
    try:
        category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
    except Exception:
        # Best-effort field: any scraping failure yields the default studio,
        # while KeyboardInterrupt/SystemExit are no longer swallowed.
        return '麻豆社'
    return category.strip()
def getYear(html):
    """Return the year field; this scraper always yields an empty string."""
    return ""
def getCover(htmlcode):
    """Return the cover image URL found in the raw page source.

    The URL is the quoted value following ``shareimage : `` in *htmlcode*.
    An empty string is returned when the marker is absent or *htmlcode* is
    not text.
    """
    try:
        found = re.search(r"shareimage : '(.*?)'", htmlcode)
    except TypeError:
        # htmlcode was None or bytes; treat it as "no cover".
        return ''
    if found is None:
        return ''
    return found.group(1).strip()
def getRelease(html):
    """Return the release-date field; this scraper always yields an empty string."""
    return ""
def getRuntime(html):
    """Return the runtime field; this scraper always yields an empty string."""
    return ""
def getActor(html):
    """Return the performers as a new list of ``div.article-tags`` link texts."""
    tag_texts = html.xpath('//div[@class="article-tags"]/a/text()')
    return list(tag_texts)
def getUrl(html):
    """Return the ``data-url`` of the first ``a.share-weixin`` element as a string.

    Raises:
        IndexError: if the page has no such element.
    """
    share_urls = html.xpath('//a[@class="share-weixin"]/@data-url')
    first_url = share_urls[0]
    return str(first_url)
def getNum(url, number):
    """Derive the upper-cased movie number from the page URL.

    The URL path is percent-decoded, its leading ``/`` and a trailing
    ``.html`` (if present) are dropped, and the rest is upper-cased. If that
    differs from *number* (compared case-insensitively), everything from the
    first non-ASCII run onwards is discarded. Leading/trailing ``-`` are
    removed from the result. Any failure yields an empty string.
    """
    try:
        # Decode the percent-encoded path (it may carry non-ASCII title text).
        name = unquote(urlparse(url).path)[1:]
        # Only drop the extension when it is really there, instead of
        # unconditionally chopping the last five characters.
        if name.lower().endswith('.html'):
            name = name[:-5]
        result = name.upper().strip()
        if result != number.upper():
            # Keep only the ASCII prefix before any non-ASCII title text.
            result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]
        return result.strip('-')
    except Exception:
        return ''
def getDirector(html):
    """Return the director field; this scraper always yields an empty string."""
    return ""
def getOutline(html):
    """Return the outline field; this scraper always yields an empty string."""
    return ""
def getSerise(html):
    """Return the series field; this scraper always yields an empty string."""
    return ""
def getTag(html):
    """Return the tags: the XPath result for ``div.article-tags`` link texts."""
    tag_texts = html.xpath('//div[@class="article-tags"]/a/text()')
    return tag_texts
def getExtrafanart(html):
    """Return the extra-fanart field; this scraper always yields an empty string."""
    return ""
def main(number):
    """Scrape madou.club for *number* and return the metadata as a JSON string.

    The page URL is built from the lower-cased number. On success the JSON
    object carries the keys assembled below. On any failure the result is
    the JSON object ``{"title": ""}``; the exception is printed when the
    configuration's debug flag is on.
    """
    try:
        try:
            number = number.lower()
            url = "https://madou.club/" + number + ".html"
            htmlcode = get_html(url)
        except Exception:
            # Report which number failed, then re-raise so the outer handler
            # sees the real error rather than a NameError on the unset
            # ``htmlcode``. KeyboardInterrupt is no longer swallowed here.
            print(number)
            raise
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        # From here on use the URL published in the page's share link.
        url = getUrl(html)
        series = getSerise(html)
        dic = {
            # Title
            'title': getTitle(html, number),
            # Studio
            'studio': getStudio(html),
            # Year
            'year': getYear(html),
            # Outline / synopsis
            'outline': getOutline(html),
            # Runtime
            'runtime': getRuntime(html),
            # Director
            'director': getDirector(html),
            # Performers
            'actor': getActor(html),
            # Release date
            'release': getRelease(html),
            # Movie number
            'number': getNum(url, number),
            # Cover image URL (parsed from the raw page source)
            'cover': getCover(htmlcode),
            # Extra fanart / stills
            'extrafanart': getExtrafanart(html),
            # NOTE(review): presumably the cover-cropping mode read by the
            # caller -- confirm against the consumer of this JSON.
            'imagecut': 1,
            # Tags
            'tag': getTag(html),
            # Label (same value as the series)
            'label': series,
            # Page URL
            'website': url,
            # Name of this scraper module
            'source': 'madou.py',
            # Series
            'series': series
        }
        return json.dumps(dic, ensure_ascii=False, sort_keys=True,
                          indent=4, separators=(',', ':'))
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        # An empty title is the failure marker returned to the caller.
        data = {
            "title": "",
        }
        return json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
if __name__ == '__main__':
    # Manual check: scrape two spellings of the same number and print the JSON.
    for sample_number in ('MD-0147', 'MD0147'):
        print(main(sample_number))

158
WebCrawler/mv91.py Normal file
View File

@@ -0,0 +1,158 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
# Base URL of the 91mv site; prefixed to the search path and to result links.
host = 'https://www.91mv.org'
def getActorPhoto(html):
    """Return the actor-photo field; this scraper always yields an empty string."""
    return ""
def getTitle(html):
    """Return the title part of the ``div.player-title`` text.

    The heading is matched against ``(.*)(91.*-\\d*)``; the text captured
    before the ``91...-digits`` part is returned, stripped. An empty string
    is returned when the heading is missing or does not match.
    """
    try:
        heading = str(html.xpath('//div[@class="player-title"]/text()')[0])
        # Raw string: the former plain literal made ``\d`` an invalid escape.
        found = re.search(r'(.*)(91.*-\d*)', heading)
        if found is None:
            return ''
        return found.group(1).strip()
    except Exception:
        # Best-effort field; KeyboardInterrupt/SystemExit still propagate.
        return ''
def getStudio(html):
    """Return the studio name, which is the fixed literal for this site."""
    studio_name = '91制片厂'
    return studio_name
def getYear(html):
    """Return the year taken from the page's ``p.date`` text.

    The ``日期:`` label is removed and the first run of four digits is
    returned. If the text holds no four-digit run, the stripped text itself
    is returned. An empty string is returned when the date cannot be read.
    """
    try:
        text = str(html.xpath('//p[@class="date"]/text()')[0])
    except Exception:
        # Best-effort field; KeyboardInterrupt/SystemExit still propagate.
        return ''
    date = text.replace('日期:', '').strip()
    # Previously the whole date string was returned, duplicating getRelease.
    found = re.search(r'\d{4}', date)
    return found.group() if found else date
def getCover(htmlcode):
    """Return the cover image URL found in the raw page source.

    The URL is the double-quoted value assigned to ``var pic_url`` in
    *htmlcode*. An empty string is returned when the assignment is absent or
    *htmlcode* is not text.
    """
    try:
        found = re.search(r'var pic_url = "(.*?)"', htmlcode)
    except TypeError:
        # htmlcode was None or bytes; treat it as "no cover".
        return ''
    if found is None:
        return ''
    return found.group(1).strip()
def getRelease(html):
    """Return the release date taken from the page's ``p.date`` text.

    The ``日期:`` label is removed and surrounding whitespace is stripped.
    An empty string is returned when the date cannot be read.
    """
    try:
        text = str(html.xpath('//p[@class="date"]/text()')[0])
    except Exception:
        # Best-effort field; KeyboardInterrupt/SystemExit still propagate.
        return ''
    return text.replace('日期:', '').strip()
def getRuntime(htmlcode):
    """Return the runtime field; this scraper always yields an empty string."""
    return ""
def getActor(html):
    """Return the performers: each ``p.player-name`` text with ``主演:`` removed."""
    name_texts = html.xpath('//p[@class="player-name"]/text()')
    return [text.replace('主演:', '') for text in name_texts]
def getNum(html):
    """Return the movie number part of the ``div.player-title`` text.

    The heading is matched against ``(.*)(91.*-\\d*)``; the ``91...-digits``
    capture is returned, stripped. An empty string is returned when the
    heading is missing or does not match.
    """
    try:
        heading = str(html.xpath('//div[@class="player-title"]/text()')[0])
        # Raw string: the former plain literal made ``\d`` an invalid escape.
        found = re.search(r'(.*)(91.*-\d*)', heading)
        if found is None:
            return ''
        return found.group(2).strip()
    except Exception:
        # Best-effort field; KeyboardInterrupt/SystemExit still propagate.
        return ''
def getDirector(html):
    """Return the director field; this scraper always yields an empty string."""
    return ""
def getOutline(html):
    """Return the stripped first ``div.play-text`` text node as the outline.

    An empty string is returned when the outline cannot be read.
    """
    try:
        text = str(html.xpath('//div[@class="play-text"]/text()')[0])
    except Exception:
        # Best-effort field; KeyboardInterrupt/SystemExit still propagate.
        return ''
    return text.strip()
def getSerise(htmlcode):
    """Return the series field; this scraper always yields an empty string."""
    return ""
def getTag(html):
    """Return the tags: the XPath result for ``div.player-tag`` text nodes."""
    tag_texts = html.xpath('//div[@class="player-tag"]/text()')
    return tag_texts
def getExtrafanart(htmlcode):
    """Return the extra-fanart field; this scraper always yields an empty string."""
    return ""
def search(keyword):
    """Search the site for *keyword* and return the first result's link.

    Returns:
        The ``href`` of the first ``a.video-list`` element on the search page.

    Raises:
        IndexError: if the search page lists no ``a.video-list`` element.
    """
    from urllib.parse import quote  # local import: only this function needs it

    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break the query string.
    search_html = get_html(host + '/index/search?keywords=' + quote(keyword, safe=''))
    html = etree.fromstring(search_html, etree.HTMLParser())
    return html.xpath('//a[@class="video-list"]/@href')[0]
def main(number):
    """Scrape the 91mv site for *number* and return the metadata as a JSON string.

    The ``91CM-`` / ``91MS-`` prefix is removed (case-insensitively) and the
    remainder is used as the search keyword; the first search result is then
    fetched and parsed. On any failure the result is the JSON object
    ``{"title": ""}``; the exception is printed when the configuration's
    debug flag is on.
    """
    try:
        try:
            number = re.sub(r'91(?:CM|MS)-', '', number, flags=re.IGNORECASE)
            url = host + str(search(number))
            htmlcode = get_html(url)
        except Exception:
            # Report which number failed, then re-raise so the outer handler
            # sees the real error rather than a NameError on the unset
            # ``htmlcode``. KeyboardInterrupt is no longer swallowed here.
            print(number)
            raise
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        series = getSerise(html)
        dic = {
            # Title
            'title': getTitle(html),
            # Studio
            'studio': getStudio(html),
            # Year
            'year': getYear(html),
            # Outline / synopsis
            'outline': getOutline(html),
            # Runtime
            'runtime': getRuntime(html),
            # Director
            'director': getDirector(html),
            # Performers
            'actor': getActor(html),
            # Release date
            'release': getRelease(html),
            # Movie number
            'number': getNum(html),
            # Cover image URL (parsed from the raw page source)
            'cover': getCover(htmlcode),
            # Extra fanart / stills
            'extrafanart': getExtrafanart(html),
            # NOTE(review): presumably the cover-cropping mode read by the
            # caller -- confirm against the consumer of this JSON.
            'imagecut': 1,
            # Tags
            'tag': getTag(html),
            # Label (same value as the series)
            'label': series,
            # Page URL
            'website': url,
            # Name of this scraper module
            'source': 'mv91.py',
            # Series
            'series': series
        }
        return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                          separators=(',', ':'))
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        # An empty title is the failure marker returned to the caller.
        data = {
            "title": "",
        }
        return json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
if __name__ == '__main__':
    # Manual check: scrape a few sample numbers and print the resulting JSON.
    for sample_number in ('91CM-121', '91CM-122', '91CM-143', '91MS-006'):
        print(main(sample_number))