import json import re from multiprocessing.pool import ThreadPool import config from ADC_function import translate # =========website======== from . import airav from . import avsox from . import fanza from . import fc2 from . import jav321 from . import javbus from . import javdb from . import mgstage from . import xcity # from . import javlib from . import dlsite from . import carib from . import fc2club def get_data_state(data: dict) -> bool: # 元数据获取失败检测 if "title" not in data or "number" not in data: return False if data["title"] is None or data["title"] == "" or data["title"] == "null": return False if data["number"] is None or data["number"] == "" or data["number"] == "null": return False return True def get_data_from_json(file_number): # 从JSON返回元数据 """ iterate through all services and fetch the data """ func_mapping = { "airav": airav.main, "avsox": avsox.main, "fc2": fc2.main, "fanza": fanza.main, "javdb": javdb.main, "javbus": javbus.main, "mgstage": mgstage.main, "jav321": jav321.main, "xcity": xcity.main, # "javlib": javlib.main, "dlsite": dlsite.main, "carib": carib.main, "fc2club": fc2club.main } conf = config.getInstance() # default fetch order list, from the beginning to the end sources = conf.sources().split(',') if not len(conf.sources()) > 80: # if the input file name matches certain rules, # move some web service to the beginning of the list lo_file_number = file_number.lower() if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number) ): sources.insert(0, sources.pop(sources.index("carib"))) elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number: if "javdb" in sources: sources.insert(0, sources.pop(sources.index("javdb"))) if "avsox" in sources: sources.insert(0, sources.pop(sources.index("avsox"))) elif "mgstage" in sources and (re.match(r"\d+\D+", file_number) or "siro" in lo_file_number ): sources.insert(0, sources.pop(sources.index("mgstage"))) elif "fc2" in lo_file_number: if "javdb" in sources: sources.insert(0, sources.pop(sources.index("javdb"))) if "fc2" in sources: sources.insert(0, sources.pop(sources.index("fc2"))) if "fc2club" in sources: sources.insert(0, sources.pop(sources.index("fc2club"))) elif "dlsite" in sources and ( "rj" in lo_file_number or "vj" in lo_file_number ): sources.insert(0, sources.pop(sources.index("dlsite"))) elif re.match(r"^[a-z0-9]{3,}$", lo_file_number): if "javdb" in sources: sources.insert(0, sources.pop(sources.index("javdb"))) if "xcity" in sources: sources.insert(0, sources.pop(sources.index("xcity"))) # check sources in func_mapping todel = [] for s in sources: if not s in func_mapping: print('[!] Source Not Exist : ' + s) todel.append(s) for d in todel: print('[!] Remove Source : ' + s) sources.remove(d) json_data = {} if conf.multi_threading(): pool = ThreadPool(processes=len(conf.sources().split(','))) # Set the priority of multi-thread crawling and join the multi-thread queue for source in sources: pool.apply_async(func_mapping[source], (file_number,)) # Get multi-threaded crawling response for source in sources: if conf.debug() == True: print('[+]select', source) json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get()) # if any service return a valid return, break if get_data_state(json_data): print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break pool.close() pool.terminate() else: for source in sources: try: if conf.debug() == True: print('[+]select', source) json_data = json.loads(func_mapping[source](file_number)) # if any service return a valid return, break if get_data_state(json_data): print(f"[+]Find movie [{file_number}] metadata on website '{source}'") break except: break # Return if data not found in all sources if not json_data: print('[-]Movie Number not found!') return None # 增加number严格判断,避免提交任何number,总是返回"本橋実来 ADZ335",这种返回number不一致的数据源故障 # 目前选用number命名规则是javdb.com Domain Creation Date: 2013-06-19T18:34:27Z # 然而也可以跟进关注其它命名规则例如airav.wiki Domain Creation Date: 2019-08-28T07:18:42.0Z # 如果将来javdb.com命名规则下不同Studio出现同名碰撞导致无法区分,可考虑更换规则,更新相应的number分析和抓取代码。 if str(json_data.get('number')).upper() != file_number.upper(): print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number')))) return None # ================================================网站规则添加结束================================================ title = json_data.get('title') actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',') # 字符串转列表 actor_list = [actor.strip() for actor in actor_list] # 去除空白 director = json_data.get('director') release = json_data.get('release') number = json_data.get('number') studio = json_data.get('studio') source = json_data.get('source') runtime = json_data.get('runtime') outline = json_data.get('outline') label = json_data.get('label') series = json_data.get('series') year = json_data.get('year') if json_data.get('cover_small'): cover_small = json_data.get('cover_small') else: cover_small = '' if json_data.get('trailer'): trailer = json_data.get('trailer') else: trailer = '' if json_data.get('extrafanart'): extrafanart = json_data.get('extrafanart') else: extrafanart = '' imagecut = json_data.get('imagecut') tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # 字符串转列表 @ while 'XXXX' in tag: tag.remove('XXXX') while 'xxx' in tag: tag.remove('xxx') actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '') if title == '' or number == '': print('[-]Movie Number or Title not found!') return None # if imagecut == '3': # DownloadFileWithFilename() # ====================处理异常字符====================== #\/:*?"<>| actor = special_characters_replacement(actor) actor_list = [special_characters_replacement(a) for a in actor_list] title = special_characters_replacement(title) label = special_characters_replacement(label) outline = special_characters_replacement(outline) series = special_characters_replacement(series) studio = special_characters_replacement(studio) director = special_characters_replacement(director) tag = [special_characters_replacement(t) for t in tag] release = release.replace('/', '-') tmpArr = cover_small.split(',') if len(tmpArr) > 0: cover_small = tmpArr[0].strip('\"').strip('\'') # ====================处理异常字符 END================== #\/:*?"<>| # === 替换Studio片假名 studio = studio.replace('アイエナジー','Energy') studio = studio.replace('アイデアポケット','Idea Pocket') studio = studio.replace('アキノリ','AKNR') studio = studio.replace('アタッカーズ','Attackers') studio = re.sub('アパッチ.*','Apache',studio) studio = studio.replace('アマチュアインディーズ','SOD') studio = studio.replace('アリスJAPAN','Alice Japan') studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex') studio = studio.replace('クリスタル映像','Crystal 映像') studio = studio.replace('グローリークエスト','Glory Quest') studio = studio.replace('ダスッ!','DAS!') studio = studio.replace('ディープス','DEEP’s') studio = studio.replace('ドグマ','Dogma') studio = studio.replace('プレステージ','PRESTIGE') studio = studio.replace('ムーディーズ','MOODYZ') studio = studio.replace('メディアステーション','宇宙企画') studio = studio.replace('ワンズファクトリー','WANZ FACTORY') studio = studio.replace('エスワン ナンバーワンスタイル','S1') studio = studio.replace('エスワンナンバーワンスタイル','S1') studio = studio.replace('SODクリエイト','SOD') studio = studio.replace('サディスティックヴィレッジ','SOD') studio = studio.replace('V&Rプロダクツ','V&R PRODUCE') studio = studio.replace('V&RPRODUCE','V&R PRODUCE') studio = studio.replace('レアルワークス','Real Works') studio = studio.replace('マックスエー','MAX-A') studio = studio.replace('ピーターズMAX','PETERS MAX') studio = studio.replace('プレミアム','PREMIUM') studio = studio.replace('ナチュラルハイ','NATURAL HIGH') studio = studio.replace('マキシング','MAXING') studio = studio.replace('エムズビデオグループ','M’s Video Group') studio = studio.replace('ミニマム','Minimum') studio = studio.replace('ワープエンタテインメント','WAAP Entertainment') studio = studio.replace('pacopacomama,パコパコママ','pacopacomama') studio = studio.replace('パコパコママ','pacopacomama') studio = re.sub('.*/妄想族','妄想族',studio) studio = studio.replace('/',' ') # === 替换Studio片假名 END # 返回处理后的json_data json_data['title'] = title json_data['original_title'] = title json_data['actor'] = actor json_data['release'] = release json_data['cover_small'] = cover_small json_data['tag'] = tag json_data['year'] = year json_data['actor_list'] = actor_list json_data['trailer'] = trailer json_data['extrafanart'] = extrafanart json_data['label'] = label json_data['outline'] = outline json_data['series'] = series json_data['studio'] = studio json_data['director'] = director if conf.is_transalte(): translate_values = conf.transalte_values().split(",") for translate_value in translate_values: if json_data[translate_value] == "": continue t = "" # if conf.get_transalte_engine() == "baidu": # json_data[translate_value] = translate( # json_data[translate_value], # target_language="zh", # engine=conf.get_transalte_engine(), # app_id=conf.get_transalte_appId(), # key=conf.get_transalte_key(), # delay=conf.get_transalte_delay(), # ) if conf.get_transalte_engine() == "azure": t = translate( json_data[translate_value], target_language="zh-Hans", engine=conf.get_transalte_engine(), key=conf.get_transalte_key(), ) else: t = translate(json_data[translate_value]) if len(t): json_data[translate_value] = special_characters_replacement(t) naming_rule="" for i in conf.naming_rule().split("+"): if i not in json_data: naming_rule += i.strip("'").strip('"') else: naming_rule += json_data.get(i) json_data['naming_rule'] = naming_rule return json_data def special_characters_replacement(text) -> str: if not isinstance(text, str): return text return (text.replace('\\', '∖'). # U+2216 SET MINUS @ Basic Multilingual Plane replace('/', '∕'). # U+2215 DIVISION SLASH @ Basic Multilingual Plane replace(':', '꞉'). # U+A789 MODIFIER LETTER COLON @ Latin Extended-D replace('*', '∗'). # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane replace('?', '?'). # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane replace('"', '"'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane replace('‘', '‘'). # U+02018 LEFT SINGLE QUOTATION MARK replace('’', '’'). # U+02019 RIGHT SINGLE QUOTATION MARK replace('&', '&'))