Merge branch 'yoshiko2_master'

# Conflicts:
#	WebCrawler/javdb.py
Deng Zhou committed 2022-03-10 21:51:52 +08:00
17 changed files with 925 additions and 306 deletions


@@ -22,6 +22,8 @@ from . import xcity
 from . import dlsite
 from . import carib
 from . import fc2club
+from . import mv91
+from . import madou
 def get_data_state(data: dict) -> bool:  # detect a failed metadata fetch
@@ -36,9 +38,10 @@ def get_data_state(data: dict) -> bool:  # detect a failed metadata fetch
     return True
-def get_data_from_json(file_number, oCC):  # return metadata from JSON
+def get_data_from_json(file_number, oCC):
     """
-    iterate through all services and fetch the data
+    iterate through all services and fetch the data (return metadata from JSON)
     """
     actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
@@ -57,13 +60,15 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
         # "javlib": javlib.main,
         "dlsite": dlsite.main,
         "carib": carib.main,
-        "fc2club": fc2club.main
+        "fc2club": fc2club.main,
+        "mv91": mv91.main,
+        "madou": madou.main
     }
     conf = config.getInstance()
     # default fetch order list, from the beginning to the end
     sources = conf.sources().split(',')
-    if not len(conf.sources()) > 80:
+    if len(sources) <= len(func_mapping):
         # if the input file name matches certain rules,
         # move some web service to the beginning of the list
         lo_file_number = file_number.lower()
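
A minimal standalone sketch of the reordering idea in the hunk above. The names sources and lo_file_number mirror the diff; the specific rule shown (FC2-style numbers promote fc2club) is an assumption:

# Hypothetical rule: promote the fc2club scraper for FC2-style numbers.
sources = ['javbus', 'javdb', 'fc2club', 'madou', 'mv91']
lo_file_number = 'fc2-2314275'
if lo_file_number.startswith('fc2'):
    sources.insert(0, sources.pop(sources.index('fc2club')))
print(sources)  # ['fc2club', 'javbus', 'javdb', 'madou', 'mv91']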
@@ -231,8 +236,8 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
     json_data['studio'] = studio
     json_data['director'] = director
-    if conf.is_transalte():
-        translate_values = conf.transalte_values().split(",")
+    if conf.is_translate():
+        translate_values = conf.translate_values().split(",")
         for translate_value in translate_values:
             if json_data[translate_value] == "":
                 continue
@@ -244,12 +249,12 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
                     continue
             except:
                 pass
-            if conf.get_transalte_engine() == "azure":
+            if conf.get_translate_engine() == "azure":
                 t = translate(
                     json_data[translate_value],
                     target_language="zh-Hans",
-                    engine=conf.get_transalte_engine(),
-                    key=conf.get_transalte_key(),
+                    engine=conf.get_translate_engine(),
+                    key=conf.get_translate_key(),
                 )
             else:
                 t = translate(json_data[translate_value])
@@ -270,7 +275,7 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
         if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
             return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
         else:
-            return vars
+            raise IndexError('keyword not found')
     for cc in cc_vars:
         if json_data[cc] == "" or len(json_data[cc]) == 0:
             continue
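
The changed branch above now raises instead of silently returning the raw keyword. A minimal sketch of that XPath lookup against a made-up mapping fragment (the real file is mapping_actor.xml under ~/.local/share/mdc/):

from lxml import etree

# Hypothetical mapping fragment: keyword -> localized names.
mapping_data = etree.fromstring(
    '<map><a keyword="深田えいみ" zh_cn="深田咏美" zh_tw="深田詠美"/></map>')

def convert(mapping_data, language, vars):
    hits = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)
    if len(hits) != 0:
        return hits[0]
    raise IndexError('keyword not found')  # mirrors the new behavior in the diff

print(convert(mapping_data, 'zh_cn', '深田えいみ'))  # -> 深田咏美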
@@ -298,20 +303,20 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
                     json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
                 elif ccm == 3:
                     json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
-                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                    json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
             except:
                 json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
         else:
             try:
                 if ccm == 1:
                     json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
-                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
                 elif ccm == 2:
                     json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
-                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
                 elif ccm == 3:
                     json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
-                    json_data[cc] = ADC_function.delete_list_all_elements("删除", json_data[cc])
+                    json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
             except IndexError:
                 json_data[cc] = oCC.convert(json_data[cc])
             except:
@@ -322,11 +327,13 @@ def get_data_from_json(file_number, oCC):  # return metadata from JSON
         if i not in json_data:
             naming_rule += i.strip("'").strip('"')
         else:
-            naming_rule += json_data.get(i)
+            item = json_data.get(i)
+            naming_rule += item if type(item) is not list else "&".join(item)
     json_data['naming_rule'] = naming_rule
     return json_data
 def special_characters_replacement(text) -> str:
+    if not isinstance(text, str):
+        return text
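
The two added lines let non-string values (tag lists, None) pass through special_characters_replacement untouched. A minimal sketch with an assumed replacement table (the real mapping lives in the project):

def special_characters_replacement(text):
    if not isinstance(text, str):
        return text  # lists, None, etc. pass through unchanged
    # Hypothetical table of filesystem-unsafe characters.
    for bad, good in {':': '：', '?': '？', '*': '＊'}.items():
        text = text.replace(bad, good)
    return text

print(special_characters_replacement('Title: what?'))    # 'Title： what？'
print(special_characters_replacement(['tag1', 'tag2']))  # list returned as-is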


@@ -139,6 +139,7 @@ def getCover_small(html, index=0):
 def getTrailer(htmlcode):  # get the trailer URL
     video_pather = re.compile(r'<video id\=\".*?>\s*?<source src=\"(.*?)\"')
     video = video_pather.findall(htmlcode)
+    # guard against an empty match list
     if video and video[0] != "":
         if not 'https:' in video[0]:
             video_url = 'https:' + video[0]
@@ -263,16 +264,14 @@ def main(number):
         # replace with the normal cover and cut it
         imagecut = 1
         cover_small = getCover(lx)
         dp_number = getNum(lx)
-        if dp_number.upper() != number:
-            raise ValueError("number not found")
+        if dp_number.upper() != number.upper():
+            raise ValueError("number not eq " + dp_number)
         title = getTitle(lx)
-        if title and dp_number:
-            number = dp_number
         # remove duplicate title
         title = title.replace(number, '').strip()
         dic = {
             'actor': getActor(lx),
             'title': title,
@@ -325,7 +324,7 @@ if __name__ == "__main__":
     # print(main('FC2-1174949'))  # not found
     # print(main('MVSD-439'))
     # print(main('EHM0001'))  # not found
-    print(main('032517_505'))
+    print(main('FC2-2314275'))
     # print(main('EBOD-646'))
     # print(main('LOVE-262'))
     # print(main('ABP-890'))
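
The rewritten guard in the hunk above compares both sides uppercased, so a lowercase file number no longer trips the check. A tiny sketch with made-up values:

dp_number = 'fc2-2314275'  # scraped from the page (hypothetical value)
number = 'FC2-2314275'     # parsed from the local file name
if dp_number.upper() != number.upper():
    raise ValueError('number not eq ' + dp_number)
print('match')  # reached: the comparison is now case-insensitive on both sides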

WebCrawler/madou.py (new file, 164 lines)

@@ -0,0 +1,164 @@
import sys
sys.path.append('../')
import json
import re
from urllib.parse import urlparse, unquote
from lxml import etree  # needs install
from ADC_function import *
def getActorPhoto(html):
    return ''
def getTitle(html, number):  # get the title
    title = str(html.xpath('//h1[@class="article-title"]/text()')[0])
    try:
        result = str(re.split(r'[/||-]', title)[1])
        return result.strip()
    except:
        return title.replace(number.upper(), '').strip()
def getStudio(html):  # get the studio (revised)
    try:
        category = str(html.xpath('//a[@rel="category tag"]/text()')[0])
        return category.strip()
    except:
        return '麻豆社'
def getYear(html):  # get the year
    return ''
def getCover(htmlcode):  # get the cover image
    try:
        url = str(re.findall("shareimage : '(.*?)'", htmlcode)[0])
        return url.strip()
    except:
        return ''
def getRelease(html):  # get the release date
    return ''
def getRuntime(html):  # get the runtime
    return ''
def getUrl(html):
    return str(html.xpath('//a[@class="share-weixin"]/@data-url')[0])
def getNum(url, number):  # get the release number
    try:
        # decode the URL
        filename = unquote(urlparse(url).path)
        # trim the leading '/' and the trailing '.html'
        result = filename[1:-5].upper().strip()
        print(result)
        # strip the Chinese part
        if result.upper() != number.upper():
            result = re.split(r'[^\x00-\x7F]+', result, 1)[0]
        # strip stray separators
        return result.strip('-')
    except:
        return ''
def getDirector(html):  # get the director (revised)
    return ''
def getOutline(html):  # get the outline
    return ''
def getSerise(html):  # get the series (revised)
    return ''
def getTag(html):  # get the tags
    return html.xpath('//div[@class="article-tags"]/a/text()')
def getExtrafanart(html):  # get the stills
    return ''
def cutTags(raw_tags):  # split the scraped tag list into performers and tags
    actors = []
    tags = []
    for tag in raw_tags:  # madou pages list performers among the tags
        actors.append(tag)
    return actors, tags
def main(number):
    try:
        number = number.lower().strip()
        url = "https://madou.club/" + number + ".html"
        htmlcode = get_html(url)
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        url = getUrl(html)
        tags = getTag(html)
        actor, tags = cutTags(tags)
        dic = {
            # title
            'title': getTitle(html, number),
            # studio
            'studio': getStudio(html),
            # year
            'year': getYear(html),
            # outline
            'outline': getOutline(html),
            # runtime
            'runtime': getRuntime(html),
            # director
            'director': getDirector(html),
            # performers
            'actor': actor,
            # release date
            'release': getRelease(html),
            # release number
            'number': getNum(url, number),
            # cover URL
            'cover': getCover(htmlcode),
            # stills
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # tags
            'tag': tags,
            # label
            'label': getSerise(html),
            # page URL
            'website': url,
            'source': 'madou.py',
            # series
            'series': getSerise(html)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True,
                        indent=4, separators=(',', ':'))
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
if __name__ == '__main__':
    print(main('MD0094'))
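
getNum above recovers the release number from the page URL rather than the page body. A minimal sketch of the decode-and-trim steps (the sample URL is an assumption):

import re
from urllib.parse import urlparse, unquote

url = 'https://madou.club/md0094-%E7%A4%BA%E4%BE%8B.html'  # hypothetical URL
filename = unquote(urlparse(url).path)                 # '/md0094-示例.html'
result = filename[1:-5].upper().strip()                # drop '/' and '.html'
result = re.split(r'[^\x00-\x7F]+', result, maxsplit=1)[0]  # cut at first non-ASCII run
print(result.strip('-'))                               # 'MD0094'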

WebCrawler/mv91.py (new file, 158 lines)

@@ -0,0 +1,158 @@
import sys
sys.path.append('../')
import json
import re
from lxml import etree  # needs install
from ADC_function import *
host = 'https://www.91mv.org'
def getActorPhoto(html):
    return ''
def getTitle(html):  # get the title
    try:
        title = str(html.xpath('//div[@class="player-title"]/text()')[0])
        result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][0])
        return result.strip()
    except:
        return ''
def getStudio(html):  # get the studio (revised)
    return '91制片厂'
def getYear(html):  # get the year
    try:
        result = str(html.xpath('//p[@class="date"]/text()')[0])
        date = result.replace('日期:', '')
        if isinstance(date, str) and len(date):
            return date
    except:
        return ''
    return ''
def getCover(htmlcode):  # get the cover image
    try:
        url = str(re.findall(r'var pic_url = "(.*?)"', htmlcode)[0])
        return url.strip()
    except:
        return ''
def getRelease(html):  # get the release date
    try:
        result = str(html.xpath('//p[@class="date"]/text()')[0])
        date = result.replace('日期:', '')
        if isinstance(date, str) and len(date):
            return date
    except:
        return ''
    return ''
def getRuntime(htmlcode):  # get the runtime
    return ''
def getActor(html):  # get the performers
    b = []
    for player in html.xpath('//p[@class="player-name"]/text()'):
        player = player.replace('主演:', '')
        b.append(player)
    return b
def getNum(html):  # get the release number
    try:
        title = str(html.xpath('//div[@class="player-title"]/text()')[0])
        result = str(re.findall(r'(.*)(91.*-\d*)', title)[0][1])
        return result.strip()
    except:
        return ''
def getDirector(html):  # get the director (revised)
    return ''
def getOutline(html):  # get the outline
    try:
        result = str(html.xpath('//div[@class="play-text"]/text()')[0])
        return result.strip()
    except:
        return ''
def getSerise(htmlcode):  # get the series (revised)
    return ''
def getTag(html):  # get the tags
    return html.xpath('//div[@class="player-tag"]/text()')
def getExtrafanart(htmlcode):  # get the stills
    return ''
def search(keyword):  # search and return the first result's path
    search_html = get_html(host + '/index/search?keywords=' + keyword)
    html = etree.fromstring(search_html, etree.HTMLParser())
    return html.xpath('//a[@class="video-list"]/@href')[0]
def main(number):
    try:
        number = number.replace('91CM-', '').replace('91MS-', '')
        url = host + str(search(number))
        htmlcode = get_html(url)
        html = etree.fromstring(htmlcode, etree.HTMLParser())
        dic = {
            # title
            'title': getTitle(html),
            # studio
            'studio': getStudio(html),
            # year
            'year': getYear(html),
            # outline
            'outline': getOutline(html),
            # runtime
            'runtime': getRuntime(html),
            # director
            'director': getDirector(html),
            # performers
            'actor': getActor(html),
            # release date
            'release': getRelease(html),
            # release number
            'number': getNum(html),
            # cover URL
            'cover': getCover(htmlcode),
            # stills
            'extrafanart': getExtrafanart(html),
            'imagecut': 1,
            # tags
            'tag': getTag(html),
            # label
            'label': getSerise(html),
            # page URL
            'website': url,
            'source': 'mv91.py',
            # series
            'series': getSerise(html)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,
                        separators=(',', ':'))
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
if __name__ == '__main__':
    print(main('91CM-121'))
    print(main('91CM-122'))
    print(main('91CM-143'))
    print(main('91MS-006'))
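
getTitle and getNum above share one regex that splits the player title into a name part and a 91CM/91MS-style number. A quick sketch with a made-up title:

import re

title = '美丽姐妹花 91CM-121'  # hypothetical player-title text
name, num = re.findall(r'(.*)(91.*-\d*)', title)[0]
print(name.strip())  # -> 美丽姐妹花
print(num.strip())   # -> 91CM-121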