move to scrapinglib

This commit is contained in:
Mathhew
2022-05-26 14:10:26 +08:00
parent b7ecb66210
commit d6d0a1687b
2 changed files with 388 additions and 1 deletion

scraper.py (new file, 386 additions)

@@ -0,0 +1,386 @@
import json
import re
from multiprocessing.pool import ThreadPool
import ADC_function
import config
from ADC_function import translate
from lxml import etree
from pathlib import Path
from scrapinglib.api import search
# =========website========
# from . import airav
# from . import avsox
# from . import fanza
# from . import fc2
# from . import jav321
# from . import javbus
# from . import javdb
# from . import mgstage
# from . import xcity
# # from . import javlib
# from . import dlsite
# from . import carib
# from . import fc2club
# from . import mv91
# from . import madou
# from . import gcolle
# from . import getchu
# def get_data_state(data: dict) -> bool:  # detect failed metadata fetches
# if "title" not in data or "number" not in data:
# return False
# if data["title"] is None or data["title"] == "" or data["title"] == "null":
# return False
# if data["number"] is None or data["number"] == "" or data["number"] == "null":
# return False
# return True
def get_data_from_json(file_number, oCC):
"""
    Iterate through all services, fetch the data, and return the metadata from JSON.
"""
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
# func_mapping = {
# "airav": airav.main,
# "avsox": avsox.main,
# "fc2": fc2.main,
# "fanza": fanza.main,
# "javdb": javdb.main,
# "javbus": javbus.main,
# "mgstage": mgstage.main,
# "jav321": jav321.main,
# "xcity": xcity.main,
# # "javlib": javlib.main,
# "dlsite": dlsite.main,
# "carib": carib.main,
# "fc2club": fc2club.main,
# "mv91": mv91.main,
# "madou": madou.main,
# "gcolle": gcolle.main,
# "getchu": getchu.main,
# }
conf = config.getInstance()
    # default fetch order: sources are tried from first to last
sources = conf.sources().split(',')
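    # For illustration (hypothetical config value): sources = "javbus,javdb,fanza"
    # yields ['javbus', 'javdb', 'fanza'], and the sites are tried in that order
    # until one returns valid metadata.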
# def insert(sources,source):
# if source in sources:
# sources.insert(0, sources.pop(sources.index(source)))
# return sources
# if len(sources) <= len(func_mapping):
# # if the input file name matches certain rules,
# # move some web service to the beginning of the list
# lo_file_number = file_number.lower()
# if "carib" in sources and (re.search(r"^\d{6}-\d{3}", file_number)
# ):
# sources = insert(sources,"carib")
# elif "item" in file_number or "GETCHU" in file_number.upper():
# sources = insert(sources,"getchu")
# elif "rj" in lo_file_number or "vj" in lo_file_number or re.search(r"[\u3040-\u309F\u30A0-\u30FF]+", file_number):
# sources = insert(sources, "getchu")
# sources = insert(sources, "dlsite")
# elif re.search(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
# if "avsox" in sources:
# sources = insert(sources,"avsox")
# elif "mgstage" in sources and \
# (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
# sources = insert(sources,"mgstage")
# elif "fc2" in lo_file_number:
# if "fc2" in sources:
# sources = insert(sources,"fc2")
# elif "gcolle" in sources and (re.search("\d{6}", file_number)):
# sources = insert(sources,"gcolle")
# elif re.search(r"^[a-z0-9]{3,}$", lo_file_number):
# if "xcity" in sources:
# sources = insert(sources,"xcity")
# if "madou" in sources:
# sources = insert(sources,"madou")
# elif "madou" in sources and (
# re.search(r"^[a-z0-9]{3,}-[0-9]{1,}$", lo_file_number)
# ):
# sources = insert(sources,"madou")
# # check sources in func_mapping
# todel = []
# for s in sources:
# if not s in func_mapping:
# print('[!] Source Not Exist : ' + s)
# todel.append(s)
# for d in todel:
    #         print('[!] Remove Source : ' + d)
# sources.remove(d)
# json_data = {}
# if conf.multi_threading():
# pool = ThreadPool(processes=len(conf.sources().split(',')))
# # Set the priority of multi-thread crawling and join the multi-thread queue
# for source in sources:
# pool.apply_async(func_mapping[source], (file_number,))
# # Get multi-threaded crawling response
# for source in sources:
# if conf.debug() == True:
# print('[+]select', source)
# try:
# json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
# except:
# json_data = pool.apply_async(func_mapping[source], (file_number,)).get()
# # if any service return a valid return, break
# if get_data_state(json_data):
# print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
# break
# pool.close()
# pool.terminate()
# else:
# for source in sources:
# try:
# if conf.debug() == True:
# print('[+]select', source)
# try:
# json_data = json.loads(func_mapping[source](file_number))
# except:
# json_data = func_mapping[source](file_number)
# # if any service return a valid return, break
# if get_data_state(json_data):
# print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
# break
# except:
# break
    # TODO prepare parameters:
    # 1. javdb's extra cookies parameter and per-site selection
    # 2. the storyline sites parameter
    # 3. getchu is still changing and has not been added yet
    # 4. clean up ADC_function and webcrawler
    # 5. ......
proxies = None
configProxy = conf.proxy()
if configProxy.enable:
proxies = configProxy.proxies()
json_data = search(file_number, sources, proxies=proxies)
# Return if data not found in all sources
if not json_data:
print('[-]Movie Number not found!')
return None
    # Strictly check the returned number to guard against broken sources that return data for a
    # mismatched number (e.g. always answering "本橋実来 ADZ335" no matter what number is submitted).
    # The number naming rules currently in use follow javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z),
    # though other rule sets are worth tracking, e.g. airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z).
    # If different studios ever collide on the same number under javdb.com's rules,
    # consider switching rules and updating the number parsing and scraping code accordingly.
    if str(json_data.get('number')).upper() != file_number.upper():
        # abort on a number mismatch unless the source explicitly allows the change
        if not json_data.get('allow_number_change'):
            print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
            return None
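    # For illustration: if the query number were 'ABC-123' (hypothetical) and a broken
    # source always answered with 'ADZ335', the check above would return None unless
    # that source set allow_number_change.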
    # ================================================ end of website-specific rules ================================================
title = json_data.get('title')
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # convert the string to a list
    actor_list = [actor.strip() for actor in actor_list]  # strip surrounding whitespace
director = json_data.get('director')
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
    cover_small = json_data.get('cover_small') or ''
    trailer = json_data.get('trailer') or ''
    extrafanart = json_data.get('extrafanart') or ''
imagecut = json_data.get('imagecut')
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # convert the string to a list
while 'XXXX' in tag:
tag.remove('XXXX')
while 'xxx' in tag:
tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
    if not title or not number:
        print('[-]Movie Number or Title not found!')
        return None
# if imagecut == '3':
# DownloadFileWithFilename()
    # ==================== handle forbidden filename characters ====================== #\/:*?"<>|
actor = special_characters_replacement(actor)
actor_list = [special_characters_replacement(a) for a in actor_list]
title = special_characters_replacement(title)
label = special_characters_replacement(label)
outline = special_characters_replacement(outline)
series = special_characters_replacement(series)
studio = special_characters_replacement(studio)
director = special_characters_replacement(director)
tag = [special_characters_replacement(t) for t in tag]
release = release.replace('/', '-')
    cover_small = cover_small.split(',')[0].strip('"').strip("'")
    # ==================== handle forbidden filename characters END ================== #\/:*?"<>|
    # return the processed json_data
json_data['title'] = title
json_data['original_title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['year'] = year
json_data['actor_list'] = actor_list
json_data['trailer'] = trailer
json_data['extrafanart'] = extrafanart
json_data['label'] = label
json_data['outline'] = outline
json_data['series'] = series
json_data['studio'] = studio
json_data['director'] = director
if conf.is_translate():
translate_values = conf.translate_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
                try:
                    json_data[translate_value] = title_dict[number]
                    continue
                except KeyError:
                    pass
if conf.get_translate_engine() == "azure":
t = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_translate_engine(),
key=conf.get_translate_key(),
)
else:
t = translate(json_data[translate_value])
if len(t):
json_data[translate_value] = special_characters_replacement(t)
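    # For illustration (hypothetical config value): translate_values = "title,outline"
    # would run the translation step only for those two fields; empty fields are skipped.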
if oCC:
cc_vars = conf.cc_convert_vars().split(",")
ccm = conf.cc_convert_mode()
        def convert_list(mapping_data, language, vars):
            total = []
            for i in vars:
                matches = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")
                if len(matches) != 0:
                    i = matches[0]
                total.append(i)
            return total

        def convert(mapping_data, language, vars):
            matches = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)
            if len(matches) != 0:
                return matches[0]
            else:
                raise IndexError('keyword not found')
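        # A sketch of the mapping-file entry format the two helpers above assume
        # (the sample entry is illustrative, not taken from this commit):
        #   <a keyword=",ExampleActor,例子," zh_cn="例子" zh_tw="例子" jp="例子"/>
        # With such an entry, convert(actor_mapping_data, "zh_cn", "ExampleActor")
        # would return "例子".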
for cc in cc_vars:
if json_data[cc] == "" or len(json_data[cc]) == 0:
continue
if cc == "actor":
try:
if ccm == 1:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_cn", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_cn", json_data['actor'])
elif ccm == 2:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_tw", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_tw", json_data['actor'])
elif ccm == 3:
json_data['actor_list'] = convert_list(actor_mapping_data, "jp", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "jp", json_data['actor'])
except:
json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
json_data['actor'] = oCC.convert(json_data['actor'])
elif cc == "tag":
try:
if ccm == 1:
json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_list("删除", json_data[cc])
except:
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
else:
try:
if ccm == 1:
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
json_data[cc] = ADC_function.delete_all_elements_in_str("删除", json_data[cc])
except IndexError:
json_data[cc] = oCC.convert(json_data[cc])
except:
pass
    naming_rule = ""
    for i in conf.naming_rule().split("+"):
        if i not in json_data:
            naming_rule += i.strip("'").strip('"')
        else:
            item = json_data.get(i)
            naming_rule += item if not isinstance(item, list) else "&".join(item)
json_data['naming_rule'] = naming_rule
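    # For illustration (hypothetical rule): naming_rule = number+'-'+title with
    # number 'ABC-123' and title 'Some Title' yields 'ABC-123-Some Title';
    # list-valued fields are joined with '&'.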
return json_data
def special_characters_replacement(text) -> str:
    """Replace characters forbidden in filenames with visually similar Unicode ones."""
    if not isinstance(text, str):
        return text
    return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
            replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
            replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
            replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
            replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
            replace('&lsquo;', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
            replace('&rsquo;', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
            replace('&hellip;', '…').
            replace('&amp;', '&')
            )
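

# A minimal usage sketch, assuming config.getInstance() can load a valid configuration
# and the mapping files under ~/.local/share/mdc exist. 'ABC-123' is a hypothetical
# number, and passing oCC=None skips the Chinese-variant conversion step.
if __name__ == '__main__':
    metadata = get_data_from_json('ABC-123', None)
    if metadata:
        print(metadata['number'], '->', metadata['naming_rule'])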