diff --git a/core.py b/core.py index 515ea4c..29df3a7 100644 --- a/core.py +++ b/core.py @@ -14,7 +14,8 @@ from datetime import datetime from lxml import etree from ADC_function import * -from WebCrawler import get_data_from_json +# from WebCrawler import get_data_from_json +from scraper import get_data_from_json from number_parser import is_uncensored from ImageProcessing import cutImage diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..a54b936 --- /dev/null +++ b/scraper.py @@ -0,0 +1,386 @@ +import json +import re +from multiprocessing.pool import ThreadPool + +import ADC_function +import config +from ADC_function import translate +from lxml import etree +from pathlib import Path + +from scrapinglib.api import search + +# =========website======== +# from . import airav +# from . import avsox +# from . import fanza +# from . import fc2 +# from . import jav321 +# from . import javbus +# from . import javdb +# from . import mgstage +# from . import xcity +# # from . import javlib +# from . import dlsite +# from . import carib +# from . import fc2club +# from . import mv91 +# from . import madou +# from . import gcolle +# from . 
# import getchu

# NOTE: the legacy per-site dispatch that used to be kept here as commented-out
# code (``func_mapping``, the file-name based re-ordering of sources, the
# ThreadPool fan-out and the ``get_data_state`` validity check) has been
# removed; fetching is delegated to ``scrapinglib.api.search``.

# ``conf.cc_convert_mode()`` value -> attribute name looked up in the mapping XML.
_CC_MODE_LANGUAGE = {1: "zh_cn", 2: "zh_tw", 3: "jp"}

# Marker handed to ADC_function.delete_all_elements_in_list/_str after a
# mapping lookup (presumably flags entries that should be dropped -- confirm
# against ADC_function).
_CC_DELETE_MARK = "删除"

# Characters that are illegal in file names ( \ / : * ? " < > | ) and the
# look-alike code points that replace them.
_ILLEGAL_CHAR_TABLE = str.maketrans({
    '\\': '\u2216',  # U+2216 SET MINUS
    '/': '\u2215',   # U+2215 DIVISION SLASH
    ':': '\ua789',   # U+A789 MODIFIER LETTER COLON
    '*': '\u2217',   # U+2217 ASTERISK OPERATOR
    '?': '\uff1f',   # U+FF1F FULLWIDTH QUESTION MARK
    '"': '\uff02',   # U+FF02 FULLWIDTH QUOTATION MARK
    '<': '\u1438',   # U+1438 CANADIAN SYLLABICS PA
    '>': '\u1433',   # U+1433 CANADIAN SYLLABICS PO
    '|': '\u01c0',   # U+01C0 LATIN LETTER DENTAL CLICK
})

# HTML entities decoded after the character substitution above.
_HTML_ENTITIES = (
    ('&lsquo;', '\u2018'),   # U+2018 LEFT SINGLE QUOTATION MARK
    ('&rsquo;', '\u2019'),   # U+2019 RIGHT SINGLE QUOTATION MARK
    ('&hellip;', '\u2026'),  # U+2026 HORIZONTAL ELLIPSIS
    ('&amp;', '&'),
)


def _mdc_data_path(name):
    """Return the path of *name* inside ``~/.local/share/mdc``."""
    return Path.home() / '.local' / 'share' / 'mdc' / name


def _split_listish(value, drop_spaces=False):
    """Turn a list, or a string that looks like a printed list, into a list of str.

    ``None`` is treated as empty instead of becoming the literal name ``"None"``.
    Real lists are read element by element, so their content is never passed
    through ``repr()`` (which would escape characters such as U+3000).
    Apostrophes are removed in every case, as the string-based parsing always
    did.  An empty input yields ``['']``, matching the historical result.
    """
    if value is None:
        pieces = []
    elif isinstance(value, (list, tuple)):
        pieces = [str(element) for element in value]
    else:
        pieces = str(value).strip("[ ]").split(',')
    pieces = [piece.replace("'", '') for piece in pieces]
    if drop_spaces:
        pieces = [piece.replace(" ", '') for piece in pieces]
    return pieces or ['']


def _lookup_mapping(mapping_data, language, keyword):
    """Return the *language* attributes of mapping entries whose keyword contains *keyword*."""
    return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=keyword)


def _convert_keyword(mapping_data, language, keyword):
    """Map a single value through the mapping XML; raise IndexError when absent."""
    hits = _lookup_mapping(mapping_data, language, keyword)
    if not hits:
        raise IndexError('keyword not found')
    return hits[0]


def _convert_keyword_list(mapping_data, language, keywords):
    """Map every item of *keywords*; items without a mapping are kept unchanged."""
    converted = []
    for keyword in keywords:
        # Keywords are matched as ",<name>," inside the @keyword attribute.
        hits = _lookup_mapping(mapping_data, language, f",{keyword},")
        converted.append(hits[0] if hits else keyword)
    return converted


def _translate_fields(json_data, conf, number):
    """Translate the fields listed in ``conf.translate_values()`` in place."""
    for field in conf.translate_values().split(","):
        if not json_data.get(field):
            continue
        if field == "title":
            # A title listed in c_number.json takes precedence over the
            # translation engine.
            title_dict = json.loads(
                _mdc_data_path('c_number.json').read_text(encoding="utf-8"))
            try:
                json_data[field] = title_dict[number]
                continue
            except (KeyError, TypeError):
                pass
        engine = conf.get_translate_engine()
        if engine == "azure":
            translated = translate(
                json_data[field],
                target_language="zh-Hans",
                engine=engine,
                key=conf.get_translate_key(),
            )
        else:
            translated = translate(json_data[field])
        if translated:
            json_data[field] = special_characters_replacement(translated)


def _apply_cc_conversion(json_data, conf, oCC):
    """Convert the fields listed in ``conf.cc_convert_vars()`` in place.

    Each field is first looked up in the mapping XML files; ``oCC.convert`` is
    the fallback when the lookup fails.  When the configured mode is not one of
    1/2/3 no mapping lookup is done and the field is left as is.
    """
    # Parsed here rather than at the top of get_data_from_json so the files
    # are only required when a conversion is actually requested.
    actor_mapping_data = etree.parse(str(_mdc_data_path('mapping_actor.xml')))
    info_mapping_data = etree.parse(str(_mdc_data_path('mapping_info.xml')))

    language = _CC_MODE_LANGUAGE.get(conf.cc_convert_mode())
    for cc in conf.cc_convert_vars().split(","):
        if not json_data.get(cc):
            continue
        if cc == "actor":
            try:
                if language is not None:
                    json_data['actor_list'] = _convert_keyword_list(
                        actor_mapping_data, language, json_data['actor_list'])
                    json_data['actor'] = _convert_keyword(
                        actor_mapping_data, language, json_data['actor'])
            except Exception:
                # Best effort: any lookup failure falls back to oCC.
                json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
                json_data['actor'] = oCC.convert(json_data['actor'])
        elif cc == "tag":
            try:
                if language is not None:
                    json_data[cc] = _convert_keyword_list(info_mapping_data, language, json_data[cc])
                    json_data[cc] = ADC_function.delete_all_elements_in_list(
                        _CC_DELETE_MARK, json_data[cc])
            except Exception:
                json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
        else:
            try:
                if language is not None:
                    json_data[cc] = _convert_keyword(info_mapping_data, language, json_data[cc])
                    json_data[cc] = ADC_function.delete_all_elements_in_str(
                        _CC_DELETE_MARK, json_data[cc])
            except IndexError:
                json_data[cc] = oCC.convert(json_data[cc])
            except Exception:
                # Deliberately best effort: leave the field untouched.
                pass


def _build_naming_rule(json_data, rule):
    """Expand a ``+``-separated naming rule against *json_data*.

    Tokens that are keys of *json_data* are replaced by their value (lists are
    joined with ``&``, ``None`` contributes nothing); every other token is
    taken literally with surrounding quotes removed.
    """
    parts = []
    for token in rule.split("+"):
        if token not in json_data:
            parts.append(token.strip("'").strip('"'))
            continue
        item = json_data.get(token)
        if isinstance(item, list):
            parts.append("&".join(str(element) for element in item))
        elif item is not None:
            parts.append(str(item))
    return "".join(parts)


def get_data_from_json(file_number, oCC):
    """Fetch the metadata for *file_number* and normalise it.

    The configured sources are queried through ``scrapinglib.api.search``;
    the result is cleaned up (file-name safe characters, list fields),
    optionally translated and converted, and extended with a ``naming_rule``
    entry.

    :param file_number: movie number to look up.
    :param oCC: converter object exposing ``convert(str)``; a falsy value
        disables the mapping / conversion step.
    :return: the processed metadata dict, or ``None`` when nothing was found,
        the returned number does not match *file_number*, or title/number is
        empty.
    """
    conf = config.getInstance()
    # default fetch order list, from the beginning to the end
    sources = conf.sources().split(',')

    # TODO prepare parameters
    # 1. extra parameters for javdb: cookies and site selection
    # 2. storyline "sites" parameter
    # 3. getchu is still changing and has not been added
    # 4. clean up ADC_function, webcrawler
    # 5. ......
    proxies = None
    config_proxy = conf.proxy()
    if config_proxy.enable:
        proxies = config_proxy.proxies()

    json_data = search(file_number, sources, proxies=proxies)
    # Return if data not found in all sources
    if not json_data:
        print('[-]Movie Number not found!')
        return None

    # Strict number check: guards against a faulty source that answers every
    # query with the same unrelated record (e.g. always "本橋実来 ADZ335").
    # The number naming rule currently follows javdb.com (domain created
    # 2013-06-19); other rules such as airav.wiki (domain created 2019-08-28)
    # may be worth tracking.  If javdb's rule ever leads to collisions between
    # studios, consider switching rules and updating the number parsing and
    # scraping code accordingly.
    # The previous implementation wrapped ``json_data.get(...)`` in a
    # try/except; ``dict.get`` never raises, so a mismatch was never rejected.
    if (str(json_data.get('number')).upper() != file_number.upper()
            and not json_data.get('allow_number_change')):
        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
        return None

    title = json_data.get('title')
    number = json_data.get('number')
    if not title or not number:
        print('[-]Movie Number or Title not found!')
        return None

    # String / list -> list of names, surrounding whitespace removed.
    actor_list = [name.strip() for name in _split_listish(json_data.get('actor'))]
    # Comma separated actor string without any spaces.
    actor = ",".join(actor_list).replace(" ", '')
    tag = [
        t for t in _split_listish(json_data.get('tag'), drop_spaces=True)
        if t not in ('XXXX', 'xxx')
    ]

    cover_small = json_data.get('cover_small') or ''
    trailer = json_data.get('trailer') or ''
    extrafanart = json_data.get('extrafanart') or ''

    release = json_data.get('release')
    if isinstance(release, str):
        release = release.replace('/', '-')
    # Only the first comma separated entry is used, without surrounding quotes.
    cover_small = cover_small.split(',')[0].strip('\"').strip('\'')

    # ==================== replace characters illegal in file names: \/:*?"<>|
    title = special_characters_replacement(title)
    # Write the processed values back into json_data.
    json_data['title'] = title
    json_data['original_title'] = title
    json_data['actor'] = special_characters_replacement(actor)
    json_data['release'] = release
    json_data['cover_small'] = cover_small
    json_data['tag'] = [special_characters_replacement(t) for t in tag]
    json_data['year'] = json_data.get('year')
    json_data['actor_list'] = [special_characters_replacement(a) for a in actor_list]
    json_data['trailer'] = trailer
    json_data['extrafanart'] = extrafanart
    for key in ('label', 'outline', 'series', 'studio', 'director'):
        json_data[key] = special_characters_replacement(json_data.get(key))

    if conf.is_translate():
        _translate_fields(json_data, conf, number)

    if oCC:
        _apply_cc_conversion(json_data, conf, oCC)

    json_data['naming_rule'] = _build_naming_rule(json_data, conf.naming_rule())
    return json_data


def special_characters_replacement(text) -> str:
    """Replace characters that are illegal in file names with look-alikes.

    ``\\ / : * ? " < > |`` are mapped to visually similar code points, then the
    HTML entities ``&lsquo; &rsquo; &hellip; &amp;`` are decoded.  Values that
    are not ``str`` (``None``, lists, ...) are returned unchanged.

    NOTE(review): in the reviewed text the ``?`` and ``"`` replacements and the
    four trailing replacements mapped a character to itself, i.e. did nothing.
    The targets are written as explicit escapes here so they cannot be
    normalised away again.  The entity spellings are presumed to be the
    original intent -- confirm.
    """
    if not isinstance(text, str):
        return text
    text = text.translate(_ILLEGAL_CHAR_TABLE)
    for entity, char in _HTML_ENTITIES:
        text = text.replace(entity, char)
    return text