Support the 91制片厂 (91mv) and 麻豆 (Madou) sources; improve image cropping by adding a face-recognition module

commit a84452ba1c
parent 9a9d36672f
Author: hejianjun
Date: 2022-01-30 03:37:08 +08:00
8 changed files with 449 additions and 45 deletions

View File

@@ -22,6 +22,8 @@ from . import xcity
from . import dlsite
from . import carib
from . import fc2club
from . import mv91
from . import madou
def get_data_state(data: dict) -> bool: # detect a failed metadata fetch
@@ -57,7 +59,9 @@ def get_data_from_json(file_number, oCC): # 从JSON返回元数据
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main
"fc2club": fc2club.main,
"mv91": mv91.main,
"madou": madou.main
}
conf = config.getInstance()
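For context, get_data_from_json looks each configured source name up in this mapping and calls that crawler's main(number) until one of them returns usable metadata. A minimal sketch of that dispatch, with illustrative variable names (the real function adds config handling, source ordering and error reporting around this idea):

sources = ["mv91", "madou"]                          # e.g. taken from config.ini
for source in sources:
    json_text = func_mapping[source](file_number)    # e.g. madou.main("MD-0147")
    data = json.loads(json_text)
    if data.get("title"):                            # an empty title means the scrape failed
        return data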

View File

@@ -1,3 +1,4 @@
import logging
import sys
sys.path.append('../')
import re
@@ -139,7 +140,8 @@ def getCover_small(html, index=0):
def getTrailer(htmlcode): # get the trailer
video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
video = video_pather.findall(htmlcode)
if video[0] != "":
# guard against an empty match list
if video and video[0] != "":
if not 'https:' in video[0]:
video_url = 'https:' + video[0]
else:
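The added emptiness check matters because re.findall returns an empty list when the page contains no matching <video> tag, so indexing video[0] would raise IndexError before the URL check ever ran. Roughly:

video = video_pather.findall(htmlcode_without_trailer)   # hypothetical page with no trailer -> []
# `video[0] != ""` alone would raise IndexError here;
# `video and video[0] != ""` short-circuits to False instead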
@@ -263,16 +265,14 @@ def main(number):
# replace wit normal cover and cut it
imagecut = 1
cover_small = getCover(lx)
dp_number = getNum(lx)
if dp_number.upper() != number:
raise ValueError("number not found")
if dp_number.upper() != number.upper():
raise ValueError("number not eq"+dp_number)
title = getTitle(lx)
if title and dp_number:
number = dp_number
# remove duplicate title
title = title.replace(number, '').strip()
dic = {
'actor': getActor(lx),
'title': title,

WebCrawler/madou.py (new file, 164 lines)
View File

@@ -0,0 +1,164 @@
import sys
sys.path.append('../')
import json
import re
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup  # need install
from lxml import etree  # need install
from pyquery import PyQuery as pq  # need install
from ADC_function import *
def getActorPhoto(html):
return ''
def getTitle(html, number): # get the title
title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
try:
result = str(re.split(r'[/||-]', title)[1])
return result.strip()
except:
return title.replace(number.upper(), '').strip()
def getStudio(html): # get the studio (modified)
try:
category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
return category.strip()
except:
return '麻豆社'
def getYear(html): # get the year
return ''
def getCover(htmlcode): # get the cover image
try:
url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html): # get the release date
return ''
def getRuntime(html): # get the runtime
return ''
def getActor(html): # get the actresses
b = []
for player in html.xpath('//div[@class="article-tags"]/a/text()'):
b.append(player)
return b
def getUrl(html):
return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0])
def getNum(url, number): # get the movie number
try:
# decode the URL
filename = unquote(urlparse(url).path)
# strip the leading "/" and the trailing ".html"
result = filename[1:-5].upper().strip()
print(result)
# drop the Chinese part if it does not already match the requested number
if result.upper() != number.upper():
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
# strip stray hyphens
return result.strip('-')
except:
return ''
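A hypothetical walk-through of getNum (the URL and its slug format are made up for illustration):

url = "https://madou.club/md-0147-某某标题.html"       # hypothetical slug
filename = unquote(urlparse(url).path)                 # "/md-0147-某某标题.html"
result = filename[1:-5].upper().strip()                # "MD-0147-某某标题"
# this differs from the requested "MD-0147", so everything from the first
# non-ASCII character onward is dropped, leaving "MD-0147-"
result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
result.strip('-')                                      # -> "MD-0147"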
def getDirector(html): # get the director (modified)
return ''
def getOutline(html): # get the outline
return ''
def getSerise(html): # get the series (modified)
return ''
def getTag(html): # get the tags
return html.xpath('//div[@class="article-tags"]/a/text()')
def getExtrafanart(html): # get the extra fanart (stills)
return ''
def main(number):
try:
try:
number = number.lower()
url = "https://madou.club/" + number + ".html"
htmlcode = get_html(url)
except:
print(number)
html = etree.fromstring(htmlcode, etree.HTMLParser())
url = getUrl(html)
dic = {
# title
'title': getTitle(html, number),
# studio
'studio': getStudio(html),
# year
'year': getYear(html),
# outline
'outline': getOutline(html),
# runtime
'runtime': getRuntime(html),
# director
'director': getDirector(html),
# actresses
'actor': getActor(html),
# release date
'release': getRelease(html),
# movie number
'number': getNum(url, number),
# cover URL
'cover': getCover(htmlcode),
# extra fanart (stills)
'extrafanart': getExtrafanart(html),
'imagecut': 1,
# tags
'tag': getTag(html),
# label
'label': getSerise(html),
# source page URL
'website': url,
'source': 'madou.py',
# series
'series': getSerise(html)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
print(main('MD-0147'))
print(main('MD0147'))

WebCrawler/mv91.py (new file, 158 lines)
View File

@@ -0,0 +1,158 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq  # need install
from lxml import etree  # need install
from bs4 import BeautifulSoup  # need install
import json
from ADC_function import *
host = 'https://www.91mv.org'
def getActorPhoto(html):
return ''
def getTitle(html): # get the title
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][0])
return result.strip()
except:
return ''
def getStudio(html): # get the studio (modified)
return '91制片厂'
def getYear(html): # get the year
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getCover(htmlcode): # get the cover image
try:
url = str(re.findall('var pic_url = "(.*?)"',htmlcode)[0])
return url.strip()
except:
return ''
def getRelease(html): # get the release date
try:
result = str(html.xpath('//p[@class="date"]/text()')[0])
date = result.replace('日期:','')
if isinstance(date, str) and len(date):
return date
except:
return ''
return ''
def getRuntime(htmlcode): # get the runtime
return ''
def getActor(html): # get the actresses
b=[]
for player in html.xpath('//p[@class="player-name"]/text()'):
player = player.replace('主演:','')
b.append(player)
return b
def getNum(html): # get the movie number
try:
title = str(html.xpath('//div[@class="player-title"]/text()')[0])
result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][1])
return result.strip()
except:
return ''
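A made-up player title shows how the same regex feeds both getTitle and getNum:

title = "hypothetical title 91CM-121"              # illustrative only
groups = re.findall(r'(.*)(91.*-\d*)', title)[0]   # ('hypothetical title ', '91CM-121')
groups[0].strip()                                  # getTitle -> "hypothetical title"
groups[1].strip()                                  # getNum   -> "91CM-121"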
def getDirector(html): # get the director (modified)
return ''
def getOutline(html): # get the outline
try:
result = str(html.xpath('//div[@class="play-text"]/text()')[0])
return result.strip()
except:
return ''
def getSerise(htmlcode): # get the series (modified)
return ''
def getTag(html): # get the tags
return html.xpath('//div[@class="player-tag"]/text()')
def getExtrafanart(htmlcode): # get the extra fanart (stills)
return ''
def search(keyword): # search the site and return the first result's URL path
search_html = get_html(host + '/index/search?keywords=' + keyword)
html = etree.fromstring(search_html, etree.HTMLParser())
return html.xpath('//a[@class="video-list"]/@href')[0]
def main(number):
try:
try:
number = number.replace('91CM-','').replace('91MS-','')
url = host + str(search(number))
htmlcode = get_html(url)
except:
print(number)
html = etree.fromstring(htmlcode, etree.HTMLParser())
dic = {
# title
'title': getTitle(html),
# studio
'studio': getStudio(html),
# year
'year': getYear(html),
# outline
'outline': getOutline(html),
# runtime
'runtime': getRuntime(html),
# director
'director': getDirector(html),
# actresses
'actor': getActor(html),
# release date
'release': getRelease(html),
# movie number
'number': getNum(html),
# cover URL
'cover': getCover(htmlcode),
# extra fanart (stills)
'extrafanart': getExtrafanart(html),
'imagecut': 1,
# tags
'tag': getTag(html),
# label
'label': getSerise(html),
# source page URL
'website': url,
'source': 'mv91.py',
# series
'series': getSerise(html)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
if config.getInstance().debug():
print(e)
data = {
"title": "",
}
js = json.dumps(
data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
)
return js
if __name__ == '__main__':
print(main('91CM-121'))
print(main('91CM-122'))
print(main('91CM-143'))
print(main('91MS-006'))

View File

@@ -112,4 +112,8 @@ mode=1
vars=outline,series,studio,tag,title
[javdb]
sites=33,34
sites=33,34
; face recognition model. hog: Histogram of Oriented Gradients (less accurate, fast). cnn: deep-learning model (more accurate, needs GPU/CUDA, slow)
[face]
locations_model=hog
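locations_model also accepts a comma-separated fallback list; face_crop in core.py (added in this commit) splits the value on commas and tries each model in turn. A minimal sketch of that behaviour, assuming locations_model=hog,cnn:

for model in ['hog', 'cnn']:              # fast HOG first, CNN only as a fallback
    center = face_center(filename, model) # 0 when no face is found
    if center:
        break                             # first model that finds a face wins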

View File

@@ -314,6 +314,22 @@ class Config:
except:
return "33,34"
def face_locations_model(self) -> str:
try:
return self.conf.get("face", "locations_model")
except:
return "hog"
def face_app_id(self) -> str:
return self.conf.get("face", "appid")
def face_api_key(self) -> str:
return self.conf.get("face", "key")
def face_app_secret(self) -> str:
return self.conf.get("face", "secret")
@staticmethod
def _exit(sec: str) -> None:
print("[-] Read config error! Please check the {} section in config.ini", sec)

core.py (133 changed lines)
View File

@@ -5,6 +5,7 @@ import re
import shutil
import sys
from PIL import Image
from io import BytesIO
from pathlib import Path
@@ -14,6 +15,7 @@ from ADC_function import *
from WebCrawler import get_data_from_json
from number_parser import is_uncensored
def escape_path(path, escape_literals: str): # Remove escape literals
backslash = '\\'
for literal in escape_literals:
@@ -245,14 +247,18 @@ def extrafanart_download_threadpool(url_list, save_dir, number):
if conf.debug():
print(f'[!]Extrafanart download ThreadPool mode runtime {time.perf_counter() - tm_start:.3f}s')
def image_ext(url):
try:
return os.path.splitext(url)[-1]
except:
return ".jpg"
# if the cover download fails, move the source file to the failed folder
def image_download(cover, number, leak_word, c_word, hack_word, path, filepath):
filename = f"{number}{leak_word}{c_word}{hack_word}-fanart.jpg"
full_filepath = os.path.join(path, filename)
def image_download(cover, fanart_path, thumb_path, path, filepath):
full_filepath = os.path.join(path, fanart_path)
if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
return
if download_file_with_filename(cover, filename, path, filepath) == 'failed':
if download_file_with_filename(cover, fanart_path, path, filepath) == 'failed':
moveFailedFolder(filepath)
return
@@ -260,17 +266,17 @@ def image_download(cover, number, leak_word, c_word, hack_word, path, filepath):
for i in range(configProxy.retry):
if file_not_exist_or_empty(full_filepath):
print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
download_file_with_filename(cover, filename, path, filepath)
download_file_with_filename(cover, fanart_path, path, filepath)
continue
else:
break
if file_not_exist_or_empty(full_filepath):
return
print('[+]Image Downloaded!', full_filepath)
shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-thumb.jpg"))
shutil.copyfile(full_filepath, os.path.join(path, thumb_path))
def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, hack_word):
def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, hack_word, fanart_path, poster_path, thumb_path):
title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
if config.getInstance().main_mode() == 3: # in mode 3 the video file is left untouched, so the .nfo must use exactly the video's filename (apart from the extension) for KODI and similar software to pick it up
nfo_path = str(Path(filepath).with_suffix('.nfo'))
@@ -303,9 +309,9 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
print(" <plot><![CDATA[" + outline + "]]></plot>", file=code)
print(" <runtime>" + str(runtime).replace(" ", "") + "</runtime>", file=code)
print(" <director>" + director + "</director>", file=code)
print(" <poster>" + number + leak_word + c_word + hack_word + "-poster.jpg</poster>", file=code)
print(" <thumb>" + number + leak_word + c_word + hack_word + "-thumb.jpg</thumb>", file=code)
print(" <fanart>" + number + leak_word + c_word + hack_word + '-fanart.jpg' + "</fanart>", file=code)
print(" <poster>" + poster_path + "</poster>", file=code)
print(" <thumb>" + thumb_path + "</thumb>", file=code)
print(" <fanart>" + fanart_path + "</fanart>", file=code)
try:
for key in actor_list:
print(" <actor>", file=code)
@@ -365,23 +371,70 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
return
def cutImage(imagecut, path, number, leak_word, c_word, hack_word):
fullpath_noext = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}")
if imagecut == 1: # 剪裁大封面
def face_center(filename, model):
print('[+]Image found face ' + model)
try:
import face_recognition
image = face_recognition.load_image_file(filename)
face_locations = face_recognition.face_locations(image, 0, model)
if face_locations:
top, right, bottom, left = face_locations[0]
# horizontal centre of the face
return int((right+left)/2)
except Exception as e:
print("[-]", e)
return 0
def face_crop(filename, width, height):
# the cropped width is 2/3 of the height (a 2:3 poster)
cropWidthHalf = int(height/3)
try:
locations_model = filter(lambda x: x, config.getInstance().face_locations_model().lower().split(','))
for model in locations_model:
center = face_center(filename, model)
# stop at the first model that finds a face
if center:
cropLeft = center-cropWidthHalf
cropRight = center+cropWidthHalf
# clamp the crop box to the image bounds
if cropLeft < 0:
cropLeft = 0
cropRight = cropWidthHalf*2
elif cropRight > width:
cropLeft = width-cropWidthHalf*2
cropRight = width
return (cropLeft, 0, cropRight, height)
except:
print('[-]Not found face! ' + filename)
# no face found: fall back to cropping from the right edge
return (width-cropWidthHalf*2, 0, width, height)
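A worked example with made-up numbers: for a 1920x1080 fanart whose detected face centre sits at x = 1700, the crop box works out as follows.

height, width, center = 1080, 1920, 1700          # hypothetical landscape fanart
cropWidthHalf = int(height/3)                     # 360 -> poster width 720 (2:3 of 1080)
cropLeft, cropRight = center - 360, center + 360  # (1340, 2060) overflows width 1920
cropLeft, cropRight = width - 720, width          # clamped to the right edge
box = (cropLeft, 0, cropRight, height)            # (1200, 0, 1920, 1080)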
def cutImage(imagecut, path, fanart_path, poster_path):
fullpath_fanart = os.path.join(path, fanart_path)
fullpath_poster = os.path.join(path, poster_path)
if imagecut == 1: # crop the large cover
try:
img = Image.open(fullpath_noext + '-fanart.jpg')
imgSize = img.size
w = img.width
h = img.height
img2 = img.crop((w / 1.9, 0, w, h))
img2.save(fullpath_noext + '-poster.jpg')
print('[+]Image Cutted! ' + fullpath_noext + '-poster.jpg')
img = Image.open(fullpath_fanart)
width, height = img.size
if width/height > 2/3: # wider than 2:3
# crop around the detected face
img2 = img.crop(face_crop(fullpath_fanart, width, height))
elif width/height < 2/3: # taller than 2:3
# keep the top and trim the bottom to a 2:3 box
cropBottom = int(width*3/2)
img2 = img.crop((0, 0, width, cropBottom))
else: # already 2:3
img2 = img
img2.save(fullpath_poster)
print('[+]Image Cut! ' + fullpath_poster)
except Exception as e:
print(e)
print('[-]Cover cut failed!')
elif imagecut == 0: # 复制封面
shutil.copyfile(fullpath_noext + '-fanart.jpg', fullpath_noext + '-poster.jpg')
print('[+]Image Copyed! ' + fullpath_noext + '-poster.jpg')
elif imagecut == 0: # copy the cover as-is
shutil.copyfile(fullpath_fanart, fullpath_poster)
print('[+]Image Copied! ' + fullpath_poster)
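For the portrait branch, PIL's Image.crop takes a single 4-tuple box; with a hypothetical 800x1600 fanart the arithmetic is:

width, height = 800, 1600                    # 800/1600 = 0.5 < 2/3, so trim the bottom
cropBottom = int(width*3/2)                  # 1200
img2 = img.crop((0, 0, width, cropBottom))   # keeps the top, yields an 800x1200 (2:3) poster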
# this function was copied over from the GUI version
# parameter description
@@ -652,6 +705,12 @@ def core_main(file_path, number_th, oCC):
# create the folder
#path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data)
cover = json_data.get('cover')
ext = image_ext(cover)
fanart_path = f"{number}{leak_word}{c_word}{hack_word}-fanart{ext}"
poster_path = f"{number}{leak_word}{c_word}{hack_word}-poster{ext}"
thumb_path = f"{number}{leak_word}{c_word}{hack_word}-thumb{ext}"
# main_mode
# 1: 刮削模式 / Scraping mode
# 2: 整理模式 / Organizing mode
@@ -666,8 +725,9 @@ def core_main(file_path, number_th, oCC):
if imagecut == 3:
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
# creatFolder returns the numbered folder path
image_download(json_data.get('cover'), number, leak_word, c_word, hack_word, path, filepath)
image_download(cover, fanart_path, thumb_path, path, filepath)
if not multi_part or part.lower() == '-cd1':
try:
@@ -683,30 +743,29 @@ def core_main(file_path, number_th, oCC):
except:
pass
# crop the cover image
cutImage(imagecut, path, number, leak_word, c_word, hack_word)
cutImage(imagecut, path, fanart_path, poster_path)
# add watermarks
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack)
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
# move the movie file
paste_file_to_folder(filepath, path, number, leak_word, c_word, hack_word)
# finally write the .nfo metadata file; creating it marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word)
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, hack_word,
fanart_path, poster_path, thumb_path)
elif conf.main_mode() == 2:
# create the folder
path = create_folder(json_data)
# move the file
paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, hack_word)
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack)
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
elif conf.main_mode() == 3:
path = str(Path(file_path).parent)
@@ -718,7 +777,7 @@ def core_main(file_path, number_th, oCC):
small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, hack_word, filepath)
# creatFolder returns the numbered folder path
image_download(json_data.get('cover'), number, leak_word, c_word, hack_word, path, filepath)
image_download(cover, fanart_path, thumb_path, path, filepath)
if not multi_part or part.lower() == '-cd1':
# download the trailer
@@ -730,14 +789,12 @@ def core_main(file_path, number_th, oCC):
extrafanart_download(json_data.get('extrafanart'), path, number, filepath)
# crop the cover image
cutImage(imagecut, path, number, leak_word, c_word, hack_word)
cutImage(imagecut, path, fanart_path, poster_path)
# add watermarks
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}{hack_word}-thumb.jpg")
if conf.is_watermark():
add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, hack)
add_mark(os.path.join(path,poster_path), os.path.join(path,thumb_path), cn_sub, leak, uncensored, hack)
# finally write the .nfo metadata file; creating it marks the task as successful
print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
tag, json_data.get('actor_list'), liuchu, uncensored, hack_word)
tag, json_data.get('actor_list'), liuchu, uncensored, hack_word, fanart_path, poster_path, thumb_path)

View File

@@ -9,3 +9,4 @@ urllib3==1.24.3
certifi==2020.12.5
MechanicalSoup==1.1.0
opencc-python-reimplemented
face_recognition
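Note: face_recognition is built on dlib, so installing it usually needs CMake and a C++ toolchain available on the machine; the commands below are illustrative and platform-dependent.

pip install cmake
pip install face_recognition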