Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)
* ruff check --fix * ruff format --line-length 120 --target-version py39 * Change the link for G2PW Model * update pytorch version and colab
This commit is contained in:
@@ -1,24 +1,18 @@
|
||||
import time
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import traceback
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
from tqdm import tqdm
|
||||
|
||||
from module import commons
|
||||
from module.mel_processing import spectrogram_torch,spec_to_mel_torch
|
||||
from module.mel_processing import spectrogram_torch, spec_to_mel_torch
|
||||
from text import cleaned_text_to_sequence
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
import torch.nn.functional as F
|
||||
from functools import lru_cache
|
||||
import requests
|
||||
from scipy.io import wavfile
|
||||
from io import BytesIO
|
||||
from tools.my_utils import load_audio
|
||||
version = os.environ.get('version',None)
|
||||
|
||||
version = os.environ.get("version", None)
|
||||
|
||||
|
||||
# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
|
||||
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
"""
|
||||
@@ -43,7 +37,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
|
||||
for line in lines:
|
||||
tmp = line.split("\t")
|
||||
if (len(tmp) != 4):
|
||||
if len(tmp) != 4:
|
||||
continue
|
||||
self.phoneme_data[tmp[0]] = [tmp[1]]
|
||||
|
||||
@@ -51,7 +45,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
tmp = self.audiopaths_sid_text
|
||||
leng = len(tmp)
|
||||
min_num = 100
|
||||
if (leng < min_num):
|
||||
if leng < min_num:
|
||||
self.audiopaths_sid_text = []
|
||||
for _ in range(max(2, int(min_num / leng))):
|
||||
self.audiopaths_sid_text += tmp
|
||||
@@ -76,7 +70,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
for audiopath in tqdm(self.audiopaths_sid_text):
|
||||
try:
|
||||
phoneme = self.phoneme_data[audiopath][0]
|
||||
phoneme = phoneme.split(' ')
|
||||
phoneme = phoneme.split(" ")
|
||||
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
|
||||
except Exception:
|
||||
print(f"{audiopath} not in self.phoneme_data !")
|
||||
@@ -111,7 +105,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
|
||||
with torch.no_grad():
|
||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
||||
if (ssl.shape[-1] != spec.shape[-1]):
|
||||
if ssl.shape[-1] != spec.shape[-1]:
|
||||
typee = ssl.dtype
|
||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||
ssl.requires_grad = False
|
||||
@@ -129,8 +123,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
audio = torch.FloatTensor(audio_array) # /32768
|
||||
audio_norm = audio
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec = spectrogram_torch(
|
||||
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
|
||||
)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
return spec, audio_norm
|
||||
|
||||
@@ -146,12 +141,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
return len(self.audiopaths_sid_text)
|
||||
|
||||
def random_slice(self, ssl, wav, mel):
|
||||
assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, (
|
||||
"first", ssl.shape, wav.shape)
|
||||
assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape)
|
||||
|
||||
len_mel = mel.shape[1]
|
||||
if self.val:
|
||||
reference_mel = mel[:, :len_mel // 3]
|
||||
reference_mel = mel[:, : len_mel // 3]
|
||||
return reference_mel, ssl, wav, mel
|
||||
dir = random.randint(0, 1)
|
||||
sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))
|
||||
@@ -159,20 +153,29 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
if dir == 0:
|
||||
reference_mel = mel[:, :sep_point]
|
||||
ssl = ssl[:, :, sep_point:]
|
||||
wav2 = wav[:, sep_point * self.hop_length:]
|
||||
wav2 = wav[:, sep_point * self.hop_length :]
|
||||
mel = mel[:, sep_point:]
|
||||
else:
|
||||
reference_mel = mel[:, sep_point:]
|
||||
ssl = ssl[:, :, :sep_point]
|
||||
wav2 = wav[:, :sep_point * self.hop_length]
|
||||
wav2 = wav[:, : sep_point * self.hop_length]
|
||||
mel = mel[:, :sep_point]
|
||||
|
||||
assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
|
||||
ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir)
|
||||
ssl.shape,
|
||||
wav.shape,
|
||||
wav2.shape,
|
||||
mel.shape,
|
||||
sep_point,
|
||||
self.hop_length,
|
||||
sep_point * self.hop_length,
|
||||
dir,
|
||||
)
|
||||
return reference_mel, ssl, wav2, mel
|
||||
class TextAudioSpeakerCollate():
|
||||
""" Zero-pads model inputs and targets
|
||||
"""
|
||||
|
||||
|
||||
class TextAudioSpeakerCollate:
|
||||
"""Zero-pads model inputs and targets"""
|
||||
|
||||
def __init__(self, return_ids=False):
|
||||
self.return_ids = return_ids
|
||||
@@ -184,9 +187,7 @@ class TextAudioSpeakerCollate():
|
||||
batch: [text_normalized, spec_normalized, wav_normalized, sid]
|
||||
"""
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
_, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([x[1].size(1) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
|
||||
|
||||
max_ssl_len = max([x[0].size(2) for x in batch])
|
||||
max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
|
||||
@@ -214,22 +215,24 @@ class TextAudioSpeakerCollate():
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
|
||||
ssl = row[0]
|
||||
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_lengths[i] = ssl.size(2)
|
||||
|
||||
spec = row[1]
|
||||
spec_padded[i, :, :spec.size(1)] = spec
|
||||
spec_padded[i, :, : spec.size(1)] = spec
|
||||
spec_lengths[i] = spec.size(1)
|
||||
|
||||
wav = row[2]
|
||||
wav_padded[i, :, :wav.size(1)] = wav
|
||||
wav_padded[i, :, : wav.size(1)] = wav
|
||||
wav_lengths[i] = wav.size(1)
|
||||
|
||||
text = row[3]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
text_padded[i, : text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
|
||||
|
||||
|
||||
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio, speaker_id, text pairs
|
||||
@@ -253,7 +256,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
|
||||
for line in lines:
|
||||
tmp = line.split("\t")
|
||||
if (len(tmp) != 4):
|
||||
if len(tmp) != 4:
|
||||
continue
|
||||
self.phoneme_data[tmp[0]] = [tmp[1]]
|
||||
|
||||
@@ -261,7 +264,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
tmp = self.audiopaths_sid_text
|
||||
leng = len(tmp)
|
||||
min_num = 100
|
||||
if (leng < min_num):
|
||||
if leng < min_num:
|
||||
self.audiopaths_sid_text = []
|
||||
for _ in range(max(2, int(min_num / leng))):
|
||||
self.audiopaths_sid_text += tmp
|
||||
@@ -286,7 +289,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
for audiopath in tqdm(self.audiopaths_sid_text):
|
||||
try:
|
||||
phoneme = self.phoneme_data[audiopath][0]
|
||||
phoneme = phoneme.split(' ')
|
||||
phoneme = phoneme.split(" ")
|
||||
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
|
||||
except Exception:
|
||||
print(f"{audiopath} not in self.phoneme_data !")
|
||||
@@ -313,15 +316,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo
|
||||
self.audiopaths_sid_text = audiopaths_sid_text_new
|
||||
self.lengths = lengths
|
||||
self.spec_min=-12
|
||||
self.spec_max=2
|
||||
self.spec_min = -12
|
||||
self.spec_max = 2
|
||||
|
||||
self.filter_length_mel = self.win_length_mel = 1024
|
||||
self.hop_length_mel = 256
|
||||
self.n_mel_channels = 100
|
||||
self.sampling_rate_mel = 24000
|
||||
self.mel_fmin = 0
|
||||
self.mel_fmax = None
|
||||
|
||||
self.filter_length_mel=self.win_length_mel=1024
|
||||
self.hop_length_mel=256
|
||||
self.n_mel_channels=100
|
||||
self.sampling_rate_mel=24000
|
||||
self.mel_fmin=0
|
||||
self.mel_fmax=None
|
||||
def norm_spec(self, x):
|
||||
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
|
||||
|
||||
@@ -332,7 +336,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
|
||||
with torch.no_grad():
|
||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
||||
if (ssl.shape[-1] != spec.shape[-1]):
|
||||
if ssl.shape[-1] != spec.shape[-1]:
|
||||
typee = ssl.dtype
|
||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||
ssl.requires_grad = False
|
||||
@@ -347,25 +351,35 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
return (ssl, spec, mel, text)
|
||||
|
||||
def get_audio(self, filename):
|
||||
audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768
|
||||
audio=torch.FloatTensor(audio_array)#/32768
|
||||
audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768
|
||||
audio = torch.FloatTensor(audio_array) # /32768
|
||||
audio_norm = audio
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速
|
||||
audio24=torch.FloatTensor(audio_array24)#/32768
|
||||
audio_array24 = load_audio(
|
||||
filename, 24000
|
||||
) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速
|
||||
audio24 = torch.FloatTensor(audio_array24) # /32768
|
||||
audio_norm24 = audio24
|
||||
audio_norm24 = audio_norm24.unsqueeze(0)
|
||||
|
||||
spec = spectrogram_torch(audio_norm, self.filter_length,
|
||||
self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec = spectrogram_torch(
|
||||
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
|
||||
)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
|
||||
|
||||
spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
|
||||
mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
|
||||
spec1 = spectrogram_torch(
|
||||
audio_norm24,
|
||||
self.filter_length_mel,
|
||||
self.sampling_rate_mel,
|
||||
self.hop_length_mel,
|
||||
self.win_length_mel,
|
||||
center=False,
|
||||
)
|
||||
mel = spec_to_mel_torch(
|
||||
spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
|
||||
)
|
||||
mel = torch.squeeze(mel, 0)
|
||||
mel=self.norm_spec(mel)
|
||||
mel = self.norm_spec(mel)
|
||||
# print(1111111,spec.shape,mel.shape)
|
||||
return spec, mel
|
||||
|
||||
@@ -379,9 +393,10 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audiopaths_sid_text)
|
||||
class TextAudioSpeakerCollateV3():
|
||||
""" Zero-pads model inputs and targets
|
||||
"""
|
||||
|
||||
|
||||
class TextAudioSpeakerCollateV3:
|
||||
"""Zero-pads model inputs and targets"""
|
||||
|
||||
def __init__(self, return_ids=False):
|
||||
self.return_ids = return_ids
|
||||
@@ -392,12 +407,10 @@ class TextAudioSpeakerCollateV3():
|
||||
------
|
||||
batch: [text_normalized, spec_normalized, wav_normalized, sid]
|
||||
"""
|
||||
#ssl, spec, wav,mel, text
|
||||
# ssl, spec, wav,mel, text
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
_, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([x[1].size(1) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
#(ssl, spec,mel, text)
|
||||
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
|
||||
# (ssl, spec,mel, text)
|
||||
max_ssl_len = max([x[0].size(2) for x in batch])
|
||||
|
||||
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
|
||||
@@ -411,7 +424,7 @@ class TextAudioSpeakerCollateV3():
|
||||
# max_wav_len = max([x[2].size(1) for x in batch])
|
||||
|
||||
max_text_len = max([x[3].size(0) for x in batch])
|
||||
max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
|
||||
max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
|
||||
|
||||
ssl_lengths = torch.LongTensor(len(batch))
|
||||
spec_lengths = torch.LongTensor(len(batch))
|
||||
@@ -422,7 +435,7 @@ class TextAudioSpeakerCollateV3():
|
||||
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
|
||||
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len)
|
||||
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
|
||||
spec_padded.zero_()
|
||||
@@ -435,11 +448,11 @@ class TextAudioSpeakerCollateV3():
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
# ssl, spec, wav,mel, text
|
||||
ssl = row[0]
|
||||
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_lengths[i] = ssl.size(2)
|
||||
|
||||
spec = row[1]
|
||||
spec_padded[i, :, :spec.size(1)] = spec
|
||||
spec_padded[i, :, : spec.size(1)] = spec
|
||||
spec_lengths[i] = spec.size(1)
|
||||
|
||||
# wav = row[2]
|
||||
@@ -447,15 +460,17 @@ class TextAudioSpeakerCollateV3():
|
||||
# wav_lengths[i] = wav.size(1)
|
||||
|
||||
mel = row[2]
|
||||
mel_padded[i, :, :mel.size(1)] = mel
|
||||
mel_padded[i, :, : mel.size(1)] = mel
|
||||
mel_lengths[i] = mel.size(1)
|
||||
|
||||
text = row[3]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
text_padded[i, : text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
|
||||
return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
|
||||
return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
|
||||
|
||||
|
||||
class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio, speaker_id, text pairs
|
||||
@@ -479,7 +494,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
|
||||
for line in lines:
|
||||
tmp = line.split("\t")
|
||||
if (len(tmp) != 4):
|
||||
if len(tmp) != 4:
|
||||
continue
|
||||
self.phoneme_data[tmp[0]] = [tmp[1]]
|
||||
|
||||
@@ -487,7 +502,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
tmp = self.audiopaths_sid_text
|
||||
leng = len(tmp)
|
||||
min_num = 100
|
||||
if (leng < min_num):
|
||||
if leng < min_num:
|
||||
self.audiopaths_sid_text = []
|
||||
for _ in range(max(2, int(min_num / leng))):
|
||||
self.audiopaths_sid_text += tmp
|
||||
@@ -512,7 +527,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
for audiopath in tqdm(self.audiopaths_sid_text):
|
||||
try:
|
||||
phoneme = self.phoneme_data[audiopath][0]
|
||||
phoneme = phoneme.split(' ')
|
||||
phoneme = phoneme.split(" ")
|
||||
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
|
||||
except Exception:
|
||||
print(f"{audiopath} not in self.phoneme_data !")
|
||||
@@ -539,15 +554,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
assert len(audiopaths_sid_text_new) > 1 # 至少能凑够batch size,这里todo
|
||||
self.audiopaths_sid_text = audiopaths_sid_text_new
|
||||
self.lengths = lengths
|
||||
self.spec_min=-12
|
||||
self.spec_max=2
|
||||
self.spec_min = -12
|
||||
self.spec_max = 2
|
||||
|
||||
self.filter_length_mel = self.win_length_mel = 1024
|
||||
self.hop_length_mel = 256
|
||||
self.n_mel_channels = 100
|
||||
self.sampling_rate_mel = 24000
|
||||
self.mel_fmin = 0
|
||||
self.mel_fmax = None
|
||||
|
||||
self.filter_length_mel=self.win_length_mel=1024
|
||||
self.hop_length_mel=256
|
||||
self.n_mel_channels=100
|
||||
self.sampling_rate_mel=24000
|
||||
self.mel_fmin=0
|
||||
self.mel_fmax=None
|
||||
def norm_spec(self, x):
|
||||
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
|
||||
|
||||
@@ -555,10 +571,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
audiopath, phoneme_ids = audiopath_sid_text
|
||||
text = torch.FloatTensor(phoneme_ids)
|
||||
try:
|
||||
spec, mel,wav = self.get_audio("%s/%s" % (self.path5, audiopath))
|
||||
spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
|
||||
with torch.no_grad():
|
||||
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
|
||||
if (ssl.shape[-1] != spec.shape[-1]):
|
||||
if ssl.shape[-1] != spec.shape[-1]:
|
||||
typee = ssl.dtype
|
||||
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
|
||||
ssl.requires_grad = False
|
||||
@@ -573,27 +589,37 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
return (ssl, spec, wav, mel, text)
|
||||
|
||||
def get_audio(self, filename):
|
||||
audio_array = load_audio(filename,self.sampling_rate)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768
|
||||
audio=torch.FloatTensor(audio_array)#/32768
|
||||
audio_array = load_audio(filename, self.sampling_rate) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768
|
||||
audio = torch.FloatTensor(audio_array) # /32768
|
||||
audio_norm = audio
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
audio_array24 = load_audio(filename,24000)#load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速
|
||||
audio24=torch.FloatTensor(audio_array24)#/32768
|
||||
audio_array24 = load_audio(
|
||||
filename, 24000
|
||||
) # load_audio的方法是已经归一化到-1~1之间的,不用再/32768######这里可以用GPU重采样加速
|
||||
audio24 = torch.FloatTensor(audio_array24) # /32768
|
||||
audio_norm24 = audio24
|
||||
audio_norm24 = audio_norm24.unsqueeze(0)
|
||||
|
||||
spec = spectrogram_torch(audio_norm, self.filter_length,
|
||||
self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec = spectrogram_torch(
|
||||
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
|
||||
)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
|
||||
|
||||
spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
|
||||
mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
|
||||
spec1 = spectrogram_torch(
|
||||
audio_norm24,
|
||||
self.filter_length_mel,
|
||||
self.sampling_rate_mel,
|
||||
self.hop_length_mel,
|
||||
self.win_length_mel,
|
||||
center=False,
|
||||
)
|
||||
mel = spec_to_mel_torch(
|
||||
spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
|
||||
)
|
||||
mel = torch.squeeze(mel, 0)
|
||||
mel=self.norm_spec(mel)
|
||||
mel = self.norm_spec(mel)
|
||||
# print(1111111,spec.shape,mel.shape)
|
||||
return spec, mel,audio_norm
|
||||
return spec, mel, audio_norm
|
||||
|
||||
def get_sid(self, sid):
|
||||
sid = torch.LongTensor([int(sid)])
|
||||
@@ -605,9 +631,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audiopaths_sid_text)
|
||||
class TextAudioSpeakerCollateV3b():
|
||||
""" Zero-pads model inputs and targets
|
||||
"""
|
||||
|
||||
|
||||
class TextAudioSpeakerCollateV3b:
|
||||
"""Zero-pads model inputs and targets"""
|
||||
|
||||
def __init__(self, return_ids=False):
|
||||
self.return_ids = return_ids
|
||||
@@ -618,12 +645,10 @@ class TextAudioSpeakerCollateV3b():
|
||||
------
|
||||
batch: [text_normalized, spec_normalized, wav_normalized, sid]
|
||||
"""
|
||||
#ssl, spec, wav,mel, text
|
||||
# ssl, spec, wav,mel, text
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
_, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([x[1].size(1) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
#(ssl, spec,mel, text)
|
||||
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
|
||||
# (ssl, spec,mel, text)
|
||||
max_ssl_len = max([x[0].size(2) for x in batch])
|
||||
|
||||
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
|
||||
@@ -636,7 +661,7 @@ class TextAudioSpeakerCollateV3b():
|
||||
max_spec_len = int(2 * ((max_spec_len // 2) + 1))
|
||||
max_wav_len = max([x[2].size(1) for x in batch])
|
||||
max_text_len = max([x[4].size(0) for x in batch])
|
||||
max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
|
||||
max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
|
||||
|
||||
ssl_lengths = torch.LongTensor(len(batch))
|
||||
spec_lengths = torch.LongTensor(len(batch))
|
||||
@@ -647,7 +672,7 @@ class TextAudioSpeakerCollateV3b():
|
||||
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
|
||||
mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len)
|
||||
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
|
||||
spec_padded.zero_()
|
||||
@@ -660,28 +685,40 @@ class TextAudioSpeakerCollateV3b():
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
# ssl, spec, wav,mel, text
|
||||
ssl = row[0]
|
||||
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
|
||||
ssl_lengths[i] = ssl.size(2)
|
||||
|
||||
spec = row[1]
|
||||
spec_padded[i, :, :spec.size(1)] = spec
|
||||
spec_padded[i, :, : spec.size(1)] = spec
|
||||
spec_lengths[i] = spec.size(1)
|
||||
|
||||
wav = row[2]
|
||||
wav_padded[i, :, :wav.size(1)] = wav
|
||||
wav_padded[i, :, : wav.size(1)] = wav
|
||||
wav_lengths[i] = wav.size(1)
|
||||
|
||||
mel = row[3]
|
||||
mel_padded[i, :, :mel.size(1)] = mel
|
||||
mel_padded[i, :, : mel.size(1)] = mel
|
||||
mel_lengths[i] = mel.size(1)
|
||||
|
||||
text = row[4]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
text_padded[i, : text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
|
||||
return (
|
||||
ssl_padded,
|
||||
spec_padded,
|
||||
mel_padded,
|
||||
ssl_lengths,
|
||||
spec_lengths,
|
||||
text_padded,
|
||||
text_lengths,
|
||||
wav_padded,
|
||||
wav_lengths,
|
||||
mel_lengths,
|
||||
)
|
||||
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
|
||||
|
||||
|
||||
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
|
||||
"""
|
||||
Maintain similar input lengths in a batch.
|
||||
@@ -745,12 +782,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
|
||||
num_samples_bucket = self.num_samples_per_bucket[i]
|
||||
|
||||
rem = num_samples_bucket - len_bucket
|
||||
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
|
||||
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)]
|
||||
|
||||
ids_bucket = ids_bucket[self.rank::self.num_replicas]
|
||||
ids_bucket = ids_bucket[self.rank :: self.num_replicas]
|
||||
|
||||
for j in range(len(ids_bucket) // self.batch_size):
|
||||
batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
|
||||
batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]]
|
||||
batches.append(batch)
|
||||
|
||||
if self.shuffle:
|
||||
@@ -777,4 +814,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
|
||||
return -1
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples // self.batch_size
|
||||
return self.num_samples // self.batch_size
|
||||
|
||||
Reference in New Issue
Block a user