Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the link for G2PW Model

* update pytorch version and colab
This commit is contained in:
XXXXRT666
2025-04-07 09:42:47 +01:00
committed by GitHub
parent 9da7e17efe
commit 53cac93589
132 changed files with 8185 additions and 6648 deletions

View File

@@ -1,28 +1,31 @@
# This code is modified from https://github.com/ZFTurbo/
import librosa
from tqdm import tqdm
import os
import torch
import warnings
import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import yaml
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
class Roformer_Loader:
def get_config(self, config_path):
    """Load a YAML configuration file and return the parsed object.

    Args:
        config_path: Path of the YAML file; it is opened as UTF-8 text.

    Returns:
        Whatever ``yaml.load`` produces for the file contents.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        # FullLoader is used so that the !!python/tuple tag can be loaded; code can be improved.
        # NOTE(review): this assumes configuration files come from a trusted
        # source -- confirm before pointing it at user-supplied YAML.
        config = yaml.load(f, Loader=yaml.FullLoader)
    return config
def get_default_config(self):
default_config = None
if self.model_type == 'bs_roformer':
if self.model_type == "bs_roformer":
# Use model_bs_roformer_ep_368_sdr_12.9628.yaml and model_bs_roformer_ep_317_sdr_12.9755.yaml as default configuration files
# Other BS_Roformer models may not be compatible
# fmt: off
default_config = {
"audio": {"chunk_size": 352800, "sample_rate": 44100},
"model": {
@@ -51,9 +54,10 @@ class Roformer_Loader:
"multi_stft_normalized": False,
},
"training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"},
"inference": {"batch_size": 2, "num_overlap": 2}
"inference": {"batch_size": 2, "num_overlap": 2},
}
elif self.model_type == 'mel_band_roformer':
# fmt: on
elif self.model_type == "mel_band_roformer":
# Use model_mel_band_roformer_ep_3005_sdr_11.4360.yaml as default configuration files
# Other Mel_Band_Roformer models may not be compatible
default_config = {
@@ -82,29 +86,30 @@ class Roformer_Loader:
"multi_stft_resolution_loss_weight": 1.0,
"multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256),
"multi_stft_hop_size": 147,
"multi_stft_normalized": False
"multi_stft_normalized": False,
},
"training": {"instruments": ["vocals", "other"], "target_instrument": "vocals"},
"inference": {"batch_size": 2, "num_overlap": 2}
"inference": {"batch_size": 2, "num_overlap": 2},
}
return default_config
def get_model_from_config(self):
    """Instantiate the separation model selected by ``self.model_type``.

    The ``"model"`` section of ``self.config`` is copied into a dict and
    passed as keyword arguments to the model constructor.

    Returns:
        A ``BSRoformer`` instance for ``"bs_roformer"``, a ``MelBandRoformer``
        instance for ``"mel_band_roformer"``, or ``None`` (after printing an
        error message) for any other model type.
    """
    # The model classes are imported inside their branch, so only the module
    # for the selected model type is imported.
    if self.model_type == "bs_roformer":
        from bs_roformer.bs_roformer import BSRoformer

        return BSRoformer(**dict(self.config["model"]))

    if self.model_type == "mel_band_roformer":
        from bs_roformer.mel_band_roformer import MelBandRoformer

        return MelBandRoformer(**dict(self.config["model"]))

    # Unknown type: report it and hand None back to the caller instead of raising.
    print("Error: Unknown model: {}".format(self.model_type))
    return None
def demix_track(self, model, mix, device):
C = self.config["audio"]["chunk_size"] # chunk_size
C = self.config["audio"]["chunk_size"] # chunk_size
N = self.config["inference"]["num_overlap"]
fade_size = C // 10
step = int(C // N)
@@ -116,7 +121,7 @@ class Roformer_Loader:
# Pad at the beginning and the end so that the sliding-window results are better near the edges
if length_init > 2 * border and (border > 0):
mix = nn.functional.pad(mix, (border, border), mode='reflect')
mix = nn.functional.pad(mix, (border, border), mode="reflect")
# Prepare windows arrays (do 1 time for speed up). This trick repairs click problems on the edges of segment
window_size = C
@@ -125,17 +130,17 @@ class Roformer_Loader:
window_start = torch.ones(window_size)
window_middle = torch.ones(window_size)
window_finish = torch.ones(window_size)
window_start[-fade_size:] *= fadeout # First audio chunk, no fadein
window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout
window_start[-fade_size:] *= fadeout # First audio chunk, no fadein
window_finish[:fade_size] *= fadein # Last audio chunk, no fadeout
window_middle[-fade_size:] *= fadeout
window_middle[:fade_size] *= fadein
with torch.amp.autocast('cuda'):
with torch.amp.autocast("cuda"):
with torch.inference_mode():
if self.config["training"]["target_instrument"] is None:
req_shape = (len(self.config["training"]["instruments"]),) + tuple(mix.shape)
else:
req_shape = (1, ) + tuple(mix.shape)
req_shape = (1,) + tuple(mix.shape)
result = torch.zeros(req_shape, dtype=torch.float32)
counter = torch.zeros(req_shape, dtype=torch.float32)
@@ -143,15 +148,15 @@ class Roformer_Loader:
batch_data = []
batch_locations = []
while i < mix.shape[1]:
part = mix[:, i:i + C].to(device)
part = mix[:, i : i + C].to(device)
length = part.shape[-1]
if length < C:
if length > C // 2 + 1:
part = nn.functional.pad(input=part, pad=(0, C - length), mode='reflect')
part = nn.functional.pad(input=part, pad=(0, C - length), mode="reflect")
else:
part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode='constant', value=0)
part = nn.functional.pad(input=part, pad=(0, C - length, 0, 0), mode="constant", value=0)
if self.is_half:
part=part.half()
part = part.half()
batch_data.append(part)
batch_locations.append((i, length))
i += step
@@ -170,8 +175,8 @@ class Roformer_Loader:
for j in range(len(batch_locations)):
start, l = batch_locations[j]
result[..., start:start+l] += x[j][..., :l].cpu() * window[..., :l]
counter[..., start:start+l] += window[..., :l]
result[..., start : start + l] += x[j][..., :l].cpu() * window[..., :l]
counter[..., start : start + l] += window[..., :l]
batch_data = []
batch_locations = []
@@ -191,7 +196,6 @@ class Roformer_Loader:
else:
return {k: v for k, v in zip([self.config["training"]["target_instrument"]], estimated_sources)}
def run_folder(self, input, vocal_root, others_root, format):
self.model.eval()
path = input
@@ -200,20 +204,20 @@ class Roformer_Loader:
file_base_name = os.path.splitext(os.path.basename(path))[0]
sample_rate = 44100
if 'sample_rate' in self.config["audio"]:
sample_rate = self.config["audio"]['sample_rate']
if "sample_rate" in self.config["audio"]:
sample_rate = self.config["audio"]["sample_rate"]
try:
mix, sr = librosa.load(path, sr=sample_rate, mono=False)
except Exception as e:
print('Can read track: {}'.format(path))
print('Error message: {}'.format(str(e)))
print("Can read track: {}".format(path))
print("Error message: {}".format(str(e)))
return
# in case the model only supports mono tracks
isstereo = self.config["model"].get("stereo", True)
if not isstereo and len(mix.shape) != 1:
mix = np.mean(mix, axis=0) # if more than 2 channels, take mean
mix = np.mean(mix, axis=0) # if more than 2 channels, take mean
print("Warning: Track has more than 1 channels, but model is mono, taking mean of all channels.")
mix_orig = mix.copy()
@@ -226,7 +230,7 @@ class Roformer_Loader:
# other instruments are calculated by subtracting target instrument from mixture
target_instrument = self.config["training"]["target_instrument"]
other_instruments = [i for i in self.config["training"]["instruments"] if i != target_instrument]
other = mix_orig - res[target_instrument] # caculate other instruments
other = mix_orig - res[target_instrument] # caculate other instruments
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, target_instrument)
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other_instruments[0])
@@ -237,11 +241,10 @@ class Roformer_Loader:
vocal_inst = self.config["training"]["instruments"][0]
path_vocal = "{}/{}_{}.wav".format(vocal_root, file_base_name, vocal_inst)
self.save_audio(path_vocal, res[vocal_inst].T, sr, format)
for other in self.config["training"]["instruments"][1:]: # save other instruments
for other in self.config["training"]["instruments"][1:]: # save other instruments
path_other = "{}/{}_{}.wav".format(others_root, file_base_name, other)
self.save_audio(path_other, res[other].T, sr, format)
def save_audio(self, path, data, sr, format):
# input path should end with '.wav'
if format in ["wav", "flac"]:
@@ -250,10 +253,11 @@ class Roformer_Loader:
sf.write(path, data, sr)
else:
sf.write(path, data, sr)
os.system("ffmpeg -i \"{}\" -vn \"{}\" -q:a 2 -y".format(path, path[:-3] + format))
try: os.remove(path)
except: pass
os.system('ffmpeg -i "{}" -vn "{}" -q:a 2 -y'.format(path, path[:-3] + format))
try:
os.remove(path)
except:
pass
def __init__(self, model_path, config_path, device, is_half):
self.device = device
@@ -270,7 +274,9 @@ class Roformer_Loader:
if not os.path.exists(config_path):
if self.model_type is None:
# if model_type is still None, raise an error
raise ValueError("Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again.")
raise ValueError(
"Error: Unknown model type. If you are using a model without a configuration file, Ensure that your model name includes 'bs_roformer', 'bsroformer', 'mel_band_roformer', or 'melbandroformer'. Otherwise, you can manually place the model configuration file into 'tools/uvr5/uvr5w_weights' and ensure that the configuration file is named as '<model_name>.yaml' then try it again."
)
self.config = self.get_default_config()
else:
# if there is a configuration file
@@ -289,12 +295,10 @@ class Roformer_Loader:
state_dict = torch.load(model_path, map_location="cpu")
model.load_state_dict(state_dict)
if(is_half==False):
if is_half == False:
self.model = model.to(device)
else:
self.model = model.half().to(device)
def _path_audio_(self, input, others_root, vocal_root, format, is_hp3=False):
self.run_folder(input, vocal_root, others_root, format)