Merge pull request #555 from Suwmlee/master

Move get_data_from_json to WebCrawler
Yoshiko2 committed 2021-08-03 02:31:27 +08:00 (committed by GitHub)
3 changed files with 280 additions and 292 deletions
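In short, the metadata fetch no longer lives in core.py: get_data_from_json (minus its filepath argument) and get_data_state move into the WebCrawler package, and core_main now calls moveFailedFolder itself when nothing is found. A minimal sketch of the call pattern after this change, assuming conf is an already-loaded config.Config instance (the movie number is a placeholder):

from WebCrawler import get_data_from_json

# conf: a loaded config.Config, obtained however the caller already does it
json_data = get_data_from_json("ABC-123", conf)   # "ABC-123" is a made-up number
if json_data:
    print(json_data['naming_rule'], json_data['title'])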


@@ -13,19 +13,6 @@ import config
from urllib.parse import urljoin
def get_data_state(data: dict) -> bool:  # detect a failed metadata fetch
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
def getXpathSingle(htmlcode, xpath):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result1 = str(html.xpath(xpath)).strip(" ['']")


@@ -0,0 +1,258 @@
import json
import re
from multiprocessing.pool import ThreadPool
import config
from ADC_function import translate
# =========website========
from . import airav
from . import avsox
from . import fanza
from . import fc2
from . import jav321
from . import javbus
from . import javdb
from . import mgstage
from . import xcity
# from . import javlib
from . import dlsite
from . import carib
from . import fc2club
def get_data_state(data: dict) -> bool:  # detect a failed metadata fetch
if "title" not in data or "number" not in data:
return False
if data["title"] is None or data["title"] == "" or data["title"] == "null":
return False
if data["number"] is None or data["number"] == "" or data["number"] == "null":
return False
return True
def get_data_from_json(file_number, conf: config.Config):  # return metadata from JSON
"""
iterate through all services and fetch the data
"""
func_mapping = {
"airav": airav.main,
"avsox": avsox.main,
"fc2": fc2.main,
"fanza": fanza.main,
"javdb": javdb.main,
"javbus": javbus.main,
"mgstage": mgstage.main,
"jav321": jav321.main,
"xcity": xcity.main,
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main
}
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
if not len(conf.sources()) > 60:
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
):
sources.insert(0, sources.pop(sources.index("carib")))
elif "avsox" in sources and (re.match(r"^\d{5,}", file_number) or
"heyzo" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("javdb")))
sources.insert(1, sources.pop(sources.index("avsox")))
elif "mgstage" in sources and (re.match(r"\d+\D+", file_number) or
"siro" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("mgstage")))
elif "fc2" in sources and ("fc2" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("javdb")))
sources.insert(1, sources.pop(sources.index("fc2")))
sources.insert(2, sources.pop(sources.index("fc2club")))
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("dlsite")))
json_data = {}
if conf.multi_threading():
pool = ThreadPool(processes=len(conf.sources().split(',')))
# Set the priority of multi-thread crawling and join the multi-thread queue
for source in sources:
pool.apply_async(func_mapping[source], (file_number,))
# Get multi-threaded crawling response
for source in sources:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
# if any service return a valid return, break
if get_data_state(json_data):
break
pool.close()
pool.terminate()
else:
for source in sources:
try:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(func_mapping[source](file_number))
# if any service return a valid return, break
if get_data_state(json_data):
break
except:
break
# Return if data not found in all sources
if not json_data:
print('[-]Movie Data not found!')
return
    # ================================================ end of site-specific rules ================================================
title = json_data.get('title')
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # convert string to list
    actor_list = [actor.strip() for actor in actor_list]  # strip whitespace
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
if json_data.get('cover_small'):
cover_small = json_data.get('cover_small')
else:
cover_small = ''
if json_data.get('trailer'):
trailer = json_data.get('trailer')
else:
trailer = ''
if json_data.get('extrafanart'):
extrafanart = json_data.get('extrafanart')
else:
extrafanart = ''
imagecut = json_data.get('imagecut')
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # convert string to list
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
print('[-]Movie Data not found!')
return
# if imagecut == '3':
# DownloadFileWithFilename()
    # ==================== sanitize illegal characters ======================  #\/:*?"<>|
title = title.replace('\\', '')
title = title.replace('/', '')
title = title.replace(':', '')
title = title.replace('*', '')
title = title.replace('?', '')
title = title.replace('"', '')
title = title.replace('<', '')
title = title.replace('>', '')
title = title.replace('|', '')
release = release.replace('/', '-')
tmpArr = cover_small.split(',')
if len(tmpArr) > 0:
cover_small = tmpArr[0].strip('\"').strip('\'')
    # ==================== sanitize illegal characters END ==================  #\/:*?"<>|
    # === replace katakana studio names
studio = studio.replace('アイエナジー','Energy')
studio = studio.replace('アイデアポケット','Idea Pocket')
studio = studio.replace('アキノリ','AKNR')
studio = studio.replace('アタッカーズ','Attackers')
studio = re.sub('アパッチ.*','Apache',studio)
studio = studio.replace('アマチュアインディーズ','SOD')
studio = studio.replace('アリスJAPAN','Alice Japan')
studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex')
studio = studio.replace('クリスタル映像','Crystal 映像')
studio = studio.replace('グローリークエスト','Glory Quest')
studio = studio.replace('ダスッ!','DAS')
studio = studio.replace('ディープス','DEEPs')
studio = studio.replace('ドグマ','Dogma')
studio = studio.replace('プレステージ','PRESTIGE')
studio = studio.replace('ムーディーズ','MOODYZ')
studio = studio.replace('メディアステーション','宇宙企画')
studio = studio.replace('ワンズファクトリー','WANZ FACTORY')
studio = studio.replace('エスワン ナンバーワンスタイル','S1')
studio = studio.replace('エスワンナンバーワンスタイル','S1')
studio = studio.replace('SODクリエイト','SOD')
studio = studio.replace('サディスティックヴィレッジ','SOD')
studio = studio.replace('VRプロダクツ','VR PRODUCE')
studio = studio.replace('VRPRODUCE','VR PRODUCE')
studio = studio.replace('レアルワークス','Real Works')
studio = studio.replace('マックスエー','MAX-A')
studio = studio.replace('ピーターズMAX','PETERS MAX')
studio = studio.replace('プレミアム','PREMIUM')
studio = studio.replace('ナチュラルハイ','NATURAL HIGH')
studio = studio.replace('マキシング','MAXING')
studio = studio.replace('エムズビデオグループ','Ms Video Group')
studio = studio.replace('ミニマム','Minimum')
studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
studio = re.sub('.*/妄想族','妄想族',studio)
studio = studio.replace('/',' ')
    # === replace katakana studio names END
    # return the processed json_data
json_data['title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['year'] = year
json_data['actor_list'] = actor_list
json_data['trailer'] = trailer
json_data['extrafanart'] = extrafanart
if conf.is_transalte():
translate_values = conf.transalte_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
# if conf.get_transalte_engine() == "baidu":
# json_data[translate_value] = translate(
# json_data[translate_value],
# target_language="zh",
# engine=conf.get_transalte_engine(),
# app_id=conf.get_transalte_appId(),
# key=conf.get_transalte_key(),
# delay=conf.get_transalte_delay(),
# )
if conf.get_transalte_engine() == "azure":
json_data[translate_value] = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_transalte_engine(),
key=conf.get_transalte_key(),
)
else:
json_data[translate_value] = translate(json_data[translate_value])
naming_rule=""
for i in conf.naming_rule().split("+"):
if i not in json_data:
naming_rule += i.strip("'").strip('"')
else:
naming_rule += json_data.get(i)
json_data['naming_rule'] = naming_rule
return json_data
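
The source-priority logic near the top of get_data_from_json is just a pop-and-reinsert on the configured source list. A standalone illustration of that one idea, with a made-up Caribbeancom-style number (the source names match the func_mapping keys above):

import re

sources = "airav,avsox,fc2,javdb,carib".split(',')
file_number = "010121-001"                       # made-up six-digit-dash-three-digit number
if "carib" in sources and re.match(r"^\d{6}-\d{3}", file_number):
    # move "carib" to the front so it is queried first
    sources.insert(0, sources.pop(sources.index("carib")))
print(sources)                                   # ['carib', 'airav', 'avsox', 'fc2', 'javdb']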

core.py

@@ -9,24 +9,9 @@ import sys
from PIL import Image
from io import BytesIO
from multiprocessing.pool import ThreadPool
from ADC_function import *
from WebCrawler import get_data_from_json
# =========website========
from WebCrawler import airav
from WebCrawler import avsox
from WebCrawler import fanza
from WebCrawler import fc2
from WebCrawler import jav321
from WebCrawler import javbus
from WebCrawler import javdb
from WebCrawler import mgstage
from WebCrawler import xcity
# from WebCrawler import javlib
from WebCrawler import dlsite
from WebCrawler import carib
from WebCrawler import fc2club
def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -50,257 +35,6 @@ def moveFailedFolder(filepath):
    return
def get_data_from_json(file_number, filepath, conf: config.Config):  # return metadata from JSON
"""
iterate through all services and fetch the data
"""
func_mapping = {
"airav": airav.main,
"avsox": avsox.main,
"fc2": fc2.main,
"fanza": fanza.main,
"javdb": javdb.main,
"javbus": javbus.main,
"mgstage": mgstage.main,
"jav321": jav321.main,
"xcity": xcity.main,
# "javlib": javlib.main,
"dlsite": dlsite.main,
"carib": carib.main,
"fc2club": fc2club.main
}
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
if not len(conf.sources()) > 60:
# if the input file name matches certain rules,
# move some web service to the beginning of the list
lo_file_number = file_number.lower()
if "carib" in sources and (re.match(r"^\d{6}-\d{3}", file_number)
):
sources.insert(0, sources.pop(sources.index("carib")))
elif "avsox" in sources and (re.match(r"^\d{5,}", file_number) or
"heyzo" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("javdb")))
sources.insert(1, sources.pop(sources.index("avsox")))
elif "mgstage" in sources and (re.match(r"\d+\D+", file_number) or
"siro" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("mgstage")))
elif "fc2" in sources and ("fc2" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("javdb")))
sources.insert(1, sources.pop(sources.index("fc2")))
sources.insert(2, sources.pop(sources.index("fc2club")))
elif "dlsite" in sources and (
"rj" in lo_file_number or "vj" in lo_file_number
):
sources.insert(0, sources.pop(sources.index("dlsite")))
json_data = {}
if conf.multi_threading():
pool = ThreadPool(processes=len(conf.sources().split(',')))
# Set the priority of multi-thread crawling and join the multi-thread queue
for source in sources:
pool.apply_async(func_mapping[source], (file_number,))
# Get multi-threaded crawling response
for source in sources:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
# if any service return a valid return, break
if get_data_state(json_data):
break
pool.close()
pool.terminate()
else:
for source in sources:
try:
if conf.debug() == True:
print('[+]select', source)
json_data = json.loads(func_mapping[source](file_number))
# if any service return a valid return, break
if get_data_state(json_data):
break
except:
break
# Return if data not found in all sources
if not json_data:
print('[-]Movie Data not found!')
moveFailedFolder(filepath)
return
    # ================================================ end of site-specific rules ================================================
title = json_data.get('title')
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # convert string to list
    actor_list = [actor.strip() for actor in actor_list]  # strip whitespace
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
if json_data.get('cover_small') == None:
cover_small = ''
else:
cover_small = json_data.get('cover_small')
if json_data.get('trailer') == None:
trailer = ''
else:
trailer = json_data.get('trailer')
if json_data.get('extrafanart') == None:
extrafanart = ''
else:
extrafanart = json_data.get('extrafanart')
imagecut = json_data.get('imagecut')
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # convert string to list
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
print('[-]Movie Data not found!')
moveFailedFolder(filepath)
return
# if imagecut == '3':
# DownloadFileWithFilename()
    # ==================== sanitize illegal characters ======================  #\/:*?"<>|
title = title.replace('\\', '')
title = title.replace('/', '')
title = title.replace(':', '')
title = title.replace('*', '')
title = title.replace('?', '')
title = title.replace('"', '')
title = title.replace('<', '')
title = title.replace('>', '')
title = title.replace('|', '')
release = release.replace('/', '-')
tmpArr = cover_small.split(',')
if len(tmpArr) > 0:
cover_small = tmpArr[0].strip('\"').strip('\'')
    # ==================== sanitize illegal characters END ==================  #\/:*?"<>|
    # === replace katakana studio names
studio = studio.replace('アイエナジー','Energy')
studio = studio.replace('アイデアポケット','Idea Pocket')
studio = studio.replace('アキノリ','AKNR')
studio = studio.replace('アタッカーズ','Attackers')
studio = re.sub('アパッチ.*','Apache',studio)
studio = studio.replace('アマチュアインディーズ','SOD')
studio = studio.replace('アリスJAPAN','Alice Japan')
studio = studio.replace('オーロラプロジェクト・アネックス','Aurora Project Annex')
studio = studio.replace('クリスタル映像','Crystal 映像')
studio = studio.replace('グローリークエスト','Glory Quest')
studio = studio.replace('ダスッ!','DAS')
studio = studio.replace('ディープス','DEEPs')
studio = studio.replace('ドグマ','Dogma')
studio = studio.replace('プレステージ','PRESTIGE')
studio = studio.replace('ムーディーズ','MOODYZ')
studio = studio.replace('メディアステーション','宇宙企画')
studio = studio.replace('ワンズファクトリー','WANZ FACTORY')
studio = studio.replace('エスワン ナンバーワンスタイル','S1')
studio = studio.replace('エスワンナンバーワンスタイル','S1')
studio = studio.replace('SODクリエイト','SOD')
studio = studio.replace('サディスティックヴィレッジ','SOD')
studio = studio.replace('VRプロダクツ','VR PRODUCE')
studio = studio.replace('VRPRODUCE','VR PRODUCE')
studio = studio.replace('レアルワークス','Real Works')
studio = studio.replace('マックスエー','MAX-A')
studio = studio.replace('ピーターズMAX','PETERS MAX')
studio = studio.replace('プレミアム','PREMIUM')
studio = studio.replace('ナチュラルハイ','NATURAL HIGH')
studio = studio.replace('マキシング','MAXING')
studio = studio.replace('エムズビデオグループ','Ms Video Group')
studio = studio.replace('ミニマム','Minimum')
studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
studio = re.sub('.*/妄想族','妄想族',studio)
studio = studio.replace('/',' ')
    # === replace katakana studio names END
location_rule = eval(conf.location_rule())
if 'actor' in conf.location_rule() and len(actor) > 100:
print(conf.location_rule())
location_rule = eval(conf.location_rule().replace("actor","'多人作品'"))
maxlen = conf.max_title_len()
if 'title' in conf.location_rule() and len(title) > maxlen:
shorttitle = title[0:maxlen]
location_rule = location_rule.replace(title, shorttitle)
    # return the processed json_data
json_data['title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['location_rule'] = location_rule
json_data['year'] = year
json_data['actor_list'] = actor_list
if conf.is_transalte():
translate_values = conf.transalte_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
# if conf.get_transalte_engine() == "baidu":
# json_data[translate_value] = translate(
# json_data[translate_value],
# target_language="zh",
# engine=conf.get_transalte_engine(),
# app_id=conf.get_transalte_appId(),
# key=conf.get_transalte_key(),
# delay=conf.get_transalte_delay(),
# )
if conf.get_transalte_engine() == "azure":
json_data[translate_value] = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_transalte_engine(),
key=conf.get_transalte_key(),
)
else:
json_data[translate_value] = translate(json_data[translate_value])
if conf.is_trailer():
if trailer:
json_data['trailer'] = trailer
else:
json_data['trailer'] = ''
else:
json_data['trailer'] = ''
if conf.is_extrafanart():
if extrafanart:
json_data['extrafanart'] = extrafanart
else:
json_data['extrafanart'] = ''
else:
json_data['extrafanart'] = ''
naming_rule=""
for i in conf.naming_rule().split("+"):
if i not in json_data:
naming_rule += i.strip("'").strip('"')
else:
naming_rule += json_data.get(i)
json_data['naming_rule'] = naming_rule
return json_data
def get_info(json_data):  # return the fields from the json
    title = json_data.get('title')
    studio = json_data.get('studio')
@@ -324,12 +58,20 @@ def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config
    print('[+]Image Downloaded! ' + path + '/' + number + leak_word + c_word + '-poster.jpg')
-def create_folder(success_folder, location_rule, json_data, conf: config.Config):  # create folder
+def create_folder(json_data, conf: config.Config):  # create folder
    title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
-    if len(location_rule) > 240:  # create the success output folder
-        path = success_folder + '/' + location_rule.replace("'actor'", "'manypeople'", 3).replace("actor","'manypeople'",3)  # path is the directory holding the movie and its metadata
-    else:
-        path = success_folder + '/' + location_rule
+    success_folder = conf.success_folder()
+    actor = json_data.get('actor')
+    location_rule = eval(conf.location_rule(), json_data)
+    if 'actor' in conf.location_rule() and len(actor) > 100:
+        print(conf.location_rule())
+        location_rule = eval(conf.location_rule().replace("actor","'多人作品'"), json_data)
+    maxlen = conf.max_title_len()
+    if 'title' in conf.location_rule() and len(title) > maxlen:
+        shorttitle = title[0:maxlen]
+        location_rule = location_rule.replace(title, shorttitle)
+    path = success_folder + '/' + location_rule
    path = trimblank(path)
    if not os.path.exists(path):
        path = escape_path(path, conf.escape_literals())
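
The notable change above is that create_folder now evaluates the location rule itself, passing the metadata dict to eval() as its namespace, so bare names in the rule string resolve to json_data keys. A minimal illustration of that mechanism (the rule string and values here are invented):

json_data = {"actor": "someone", "number": "ABC-123"}
location_rule = eval("actor + '/' + number", json_data)   # names resolve against json_data keys
print(location_rule)                                      # someone/ABC-123
# side note: eval() also injects a '__builtins__' entry into the dict it is handed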
@@ -740,10 +482,11 @@ def core_main(file_path, number_th, conf: config.Config):
    # the commented-out variables below are not needed
    #rootpath= os.getcwd
    number = number_th
-    json_data = get_data_from_json(number, filepath, conf)  # look up the movie number
+    json_data = get_data_from_json(number, conf)  # look up the movie number
    # Return if blank dict returned (data not found)
    if not json_data:
+        moveFailedFolder(filepath)
        return
    if json_data["number"] != number:
@@ -790,7 +533,7 @@ def core_main(file_path, number_th, conf: config.Config):
    # 3: scrape without changing the path
    if conf.main_mode() == 1:
        # create folder
-        path = create_folder(conf.success_folder(), json_data.get('location_rule'), json_data, conf)
+        path = create_folder(json_data, conf)
        if multi_part == 1:
            number += part  # number gets the CD1 suffix appended here
@@ -804,13 +547,13 @@ def core_main(file_path, number_th, conf: config.Config):
        if not multi_part or part.lower() == '-cd1':
            try:
                # download the trailer
-                if json_data.get('trailer'):
+                if conf.is_trailer() and json_data.get('trailer'):
                    trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
            except:
                pass
            try:
                # download extra fanart  data, path, conf: config.Config, filepath
-                if json_data.get('extrafanart'):
+                if conf.is_extrafanart() and json_data.get('extrafanart'):
                    extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
            except:
                pass
@@ -831,7 +574,7 @@ def core_main(file_path, number_th, conf: config.Config):
    elif conf.main_mode() == 2:
        # create folder
-        path = create_folder(conf.success_folder(), json_data.get('location_rule'), json_data, conf)
+        path = create_folder(json_data, conf)
        # move the file
        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf)
        poster_path = path + '/' + number + leak_word + c_word + '-poster.jpg'
@@ -854,11 +597,11 @@ def core_main(file_path, number_th, conf: config.Config):
        if not multi_part or part.lower() == '-cd1':
            # download the trailer
-            if json_data.get('trailer'):
+            if conf.is_trailer() and json_data.get('trailer'):
                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
            # download extra fanart  data, path, conf: config.Config, filepath
-            if json_data.get('extrafanart'):
+            if conf.is_extrafanart() and json_data.get('extrafanart'):
                extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
            # crop the cover image