支持91制片室和麻豆,优化图片裁剪功能添加了人脸识别模块
This commit is contained in:
164
WebCrawler/madou.py
Normal file
164
WebCrawler/madou.py
Normal file
@@ -0,0 +1,164 @@
|
||||
from bs4 import BeautifulSoup # need install
|
||||
from lxml import etree # need install
|
||||
from pyquery import PyQuery as pq # need install
|
||||
from ADC_function import *
|
||||
import json
|
||||
import re
|
||||
from lib2to3.pgen2 import parse
|
||||
import sys
|
||||
|
||||
from urllib.parse import urlparse, unquote
|
||||
sys.path.append('../')
|
||||
|
||||
|
||||
def getActorPhoto(html):
    """Return the actor photo field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getTitle(html, number):
    """Get the title from the page's ``<h1 class="article-title">`` heading.

    The heading text is split on ``/``, ``|`` or ``-`` and the second piece
    is returned. When the heading contains none of those separators, the
    upper-cased ``number`` is removed from the heading instead.

    Args:
        html: Parsed page exposing an ``xpath`` method.
        number: Release number string; only used by the fallback.

    Returns:
        The title with surrounding whitespace removed.

    Raises:
        IndexError: If the page has no matching ``<h1>`` text node.
    """
    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
    try:
        result = str(re.split(r'[/|/|-]', title)[1])
        return result.strip()
    except IndexError:
        # No separator in the heading: there is no second piece to take.
        return title.replace(number.upper(), '').strip()
|
||||
|
||||
|
||||
def getStudio(html):
    """Get the studio name from the first ``<a rel="category tag">`` link.

    Args:
        html: Parsed page exposing an ``xpath`` method.

    Returns:
        The stripped link text, or ``'麻豆社'`` when the page has no such link.
    """
    try:
        category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
    except IndexError:
        # No category link on the page: fall back to the default studio name.
        return '麻豆社'
    return category.strip()
|
||||
|
||||
|
||||
def getYear(html):
    """Return the release year field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getCover(htmlcode):
    """Get the cover image URL from the raw page source.

    Looks for the first ``shareimage : '<url>'`` occurrence in the text.

    Args:
        htmlcode: Raw page source as text.

    Returns:
        The URL with surrounding whitespace removed, or ``''`` when the
        marker is absent or ``htmlcode`` is not text.
    """
    try:
        match = re.search(r"shareimage : '(.*?)'", htmlcode)
    except TypeError:
        # Non-text input (e.g. None or bytes): keep the best-effort contract.
        return ''
    if match is None:
        return ''
    return match.group(1).strip()
|
||||
|
||||
|
||||
def getRelease(html):
    """Return the release date field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getRuntime(html):
    """Return the runtime field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getActor(html):
    """Get the performers as a list.

    The names are the text of every link inside
    ``<div class="article-tags">``.

    Args:
        html: Parsed page exposing an ``xpath`` method.

    Returns:
        A new list holding each matched text node, in document order.
    """
    return list(html.xpath('//div[@class="article-tags"]/a/text()'))
|
||||
|
||||
|
||||
def getUrl(html):
    """Get the page URL from the ``data-url`` of the ``share-weixin`` link.

    Args:
        html: Parsed page exposing an ``xpath`` method.

    Returns:
        The first matching ``data-url`` attribute value as a string.

    Raises:
        IndexError: If the page has no ``<a class="share-weixin">`` link.
    """
    share_urls = html.xpath('//a[@class="share-weixin"]/@data-url')
    first_url = share_urls[0]
    return str(first_url)
|
||||
|
||||
|
||||
def getNum(url, number):
    """Get the release number from the page URL.

    The URL path is percent-decoded, its leading ``/`` and trailing
    ``.html`` are dropped, and the remainder is upper-cased. If that does
    not equal ``number`` (compared case-insensitively), everything from the
    first run of non-ASCII characters onward is discarded. Leading and
    trailing ``-`` are removed from the result.

    Args:
        url: Page URL, e.g. ``https://madou.club/md0147.html``.
        number: Release number the caller searched for.

    Returns:
        The upper-cased number, or ``''`` when the inputs cannot be
        processed.
    """
    try:
        # Decode the URL path (it may contain percent-encoded characters).
        path = unquote(urlparse(url).path)
        # Trim the leading slash and the ".html" suffix only when present,
        # so a path without the suffix is not truncated.
        name = path[1:] if path.startswith('/') else path
        if name.lower().endswith('.html'):
            name = name[:-len('.html')]
        result = name.upper().strip()
        # Drop the non-ASCII tail when the slug is more than the bare number.
        if result != number.upper():
            result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]
        # Remove separator dashes left dangling at either end.
        return result.strip('-')
    except (AttributeError, TypeError, ValueError):
        return ''
|
||||
|
||||
|
||||
def getDirector(html):
    """Return the director field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getOutline(html):
    """Return the outline (synopsis) field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getSerise(html):
    """Return the series field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def getTag(html):
    """Get the tags: the text of every link in ``<div class="article-tags">``.

    Args:
        html: Parsed page exposing an ``xpath`` method.

    Returns:
        Whatever ``html.xpath`` returns for the tag-link query.
    """
    tag_texts = html.xpath('//div[@class="article-tags"]/a/text()')
    return tag_texts
|
||||
|
||||
|
||||
def getExtrafanart(html):
    """Return the extra stills field, which this scraper leaves empty.

    Args:
        html: Parsed page; not inspected.

    Returns:
        Always the empty string.
    """
    return ''
|
||||
|
||||
|
||||
def main(number):
    """Fetch the madou.club page for ``number`` and return its metadata as JSON.

    The page URL is built as ``https://madou.club/<lower-cased number>.html``
    and downloaded with ``get_html``. The individual fields are then read
    with the ``get*`` helpers of this module.

    Args:
        number: Release number string, e.g. ``'MD-0147'``.

    Returns:
        A JSON string with sorted keys and 4-space indentation. If anything
        fails, a JSON object holding only an empty ``"title"`` is returned;
        the exception is printed when ``config.getInstance().debug()`` is
        true.
    """
    try:
        try:
            number = number.lower()
            url = "https://madou.club/" + number + ".html"
            htmlcode = get_html(url)
        except Exception:
            # Report which number failed, then hand the real error to the
            # outer handler instead of continuing with ``htmlcode`` unbound.
            print(number)
            raise

        html = etree.fromstring(htmlcode, etree.HTMLParser())
        # From here on ``url`` is the share URL published by the page itself.
        url = getUrl(html)
        dic = {
            # Title
            'title': getTitle(html, number),
            # Studio
            'studio': getStudio(html),
            # Year
            'year': getYear(html),
            # Outline
            'outline': getOutline(html),
            # Runtime
            'runtime': getRuntime(html),
            # Director
            'director': getDirector(html),
            # Actors
            'actor': getActor(html),
            # Release date
            'release': getRelease(html),
            # Release number
            'number': getNum(url, number),
            # Cover link
            'cover': getCover(htmlcode),
            # Extra stills
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # Tags
            'tag': getTag(html),
            # Label (filled from the series helper)
            'label': getSerise(html),
            # Page URL and the scraper that produced this record
            'website': url,
            'source': 'madou.py',
            # Series
            'series': getSerise(html)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
                        indent=4, separators=(',', ':'), )  # .encode('UTF-8')
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        # An empty title is the failure result returned to the caller.
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke run: scrape the same release with and without the hyphen.
    for sample_number in ('MD-0147', 'MD0147'):
        print(main(sample_number))
|
||||
Reference in New Issue
Block a user