Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the link for G2PW Model

* Update PyTorch version and Colab
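
For reference, a minimal sketch of re-running the same pass from a script, assuming Ruff is installed on PATH and that the whole repo (".") is the intended target — both are assumptions, since the commit only records the bare commands:

# Sketch: reproduce the commit's lint-and-format pass; "." is an assumed target path.
import subprocess

# Auto-fix lint violations first, then reformat to the 120-column, py39 style.
subprocess.run(["ruff", "check", "--fix", "."], check=True)
subprocess.run(["ruff", "format", "--line-length", "120", "--target-version", "py39", "."], check=True)

Equivalently, line-length = 120 and target-version = "py39" can be pinned under [tool.ruff] in pyproject.toml so the flags need not be repeated on every invocation.
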
Author: XXXXRT666
Date: 2025-04-07 09:42:47 +01:00
Committed by: GitHub
Parent: 9da7e17efe
Commit: 53cac93589
132 changed files with 8185 additions and 6648 deletions

@@ -1,24 +1,18 @@
import time
import logging
import os
import random
import traceback
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from module import commons
-from module.mel_processing import spectrogram_torch,spec_to_mel_torch
+from module.mel_processing import spectrogram_torch, spec_to_mel_torch
from text import cleaned_text_to_sequence
from utils import load_wav_to_torch, load_filepaths_and_text
import torch.nn.functional as F
from functools import lru_cache
import requests
from scipy.io import wavfile
from io import BytesIO
from tools.my_utils import load_audio
-version = os.environ.get('version',None)
+version = os.environ.get("version", None)
# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
@@ -43,7 +37,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
-if (len(tmp) != 4):
+if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -51,7 +45,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
-if (leng < min_num):
+if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -76,7 +70,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
-phoneme = phoneme.split(' ')
+phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -111,7 +105,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
-if (ssl.shape[-1] != spec.shape[-1]):
+if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -129,8 +123,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
audio = torch.FloatTensor(audio_array) # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
-spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length,
-center=False)
+spec = spectrogram_torch(
+audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
+)
spec = torch.squeeze(spec, 0)
return spec, audio_norm
@@ -146,12 +141,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
return len(self.audiopaths_sid_text)
def random_slice(self, ssl, wav, mel):
-assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, (
-"first", ssl.shape, wav.shape)
+assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape)
len_mel = mel.shape[1]
if self.val:
-reference_mel = mel[:, :len_mel // 3]
+reference_mel = mel[:, : len_mel // 3]
return reference_mel, ssl, wav, mel
dir = random.randint(0, 1)
sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))
@@ -159,20 +153,29 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
if dir == 0:
reference_mel = mel[:, :sep_point]
ssl = ssl[:, :, sep_point:]
-wav2 = wav[:, sep_point * self.hop_length:]
+wav2 = wav[:, sep_point * self.hop_length :]
mel = mel[:, sep_point:]
else:
reference_mel = mel[:, sep_point:]
ssl = ssl[:, :, :sep_point]
-wav2 = wav[:, :sep_point * self.hop_length]
+wav2 = wav[:, : sep_point * self.hop_length]
mel = mel[:, :sep_point]
assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
-ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir)
+ssl.shape,
+wav.shape,
+wav2.shape,
+mel.shape,
+sep_point,
+self.hop_length,
+sep_point * self.hop_length,
+dir,
+)
return reference_mel, ssl, wav2, mel
-class TextAudioSpeakerCollate():
-""" Zero-pads model inputs and targets
-"""
+class TextAudioSpeakerCollate:
+"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -184,9 +187,7 @@ class TextAudioSpeakerCollate():
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
-_, ids_sorted_decreasing = torch.sort(
-torch.LongTensor([x[1].size(1) for x in batch]),
-dim=0, descending=True)
+_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
@@ -214,22 +215,24 @@ class TextAudioSpeakerCollate():
row = batch[ids_sorted_decreasing[i]]
ssl = row[0]
-ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
+ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
-spec_padded[i, :, :spec.size(1)] = spec
+spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
-wav_padded[i, :, :wav.size(1)] = wav
+wav_padded[i, :, : wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
text = row[3]
-text_padded[i, :text.size(0)] = text
+text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
@@ -253,7 +256,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
-if (len(tmp) != 4):
+if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -261,7 +264,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
-if (leng < min_num):
+if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -286,7 +289,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
-phoneme = phoneme.split(' ')
+phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -313,15 +316,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
assert len(audiopaths_sid_text_new) > 1  # need at least enough utterances to fill a batch; TODO here
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
-self.spec_min=-12
-self.spec_max=2
-self.filter_length_mel=self.win_length_mel=1024
-self.hop_length_mel=256
-self.n_mel_channels=100
-self.sampling_rate_mel=24000
-self.mel_fmin=0
-self.mel_fmax=None
+self.spec_min = -12
+self.spec_max = 2
+self.filter_length_mel = self.win_length_mel = 1024
+self.hop_length_mel = 256
+self.n_mel_channels = 100
+self.sampling_rate_mel = 24000
+self.mel_fmin = 0
+self.mel_fmax = None
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
@@ -332,7 +336,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
-if (ssl.shape[-1] != spec.shape[-1]):
+if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -347,25 +351,35 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
return (ssl, spec, mel, text)
def get_audio(self, filename):
-audio_array = load_audio(filename,self.sampling_rate)  # load_audio already normalizes to -1..1, so no further /32768 is needed
-audio=torch.FloatTensor(audio_array)  # /32768
+audio_array = load_audio(filename, self.sampling_rate)  # load_audio already normalizes to -1..1, so no further /32768 is needed
+audio = torch.FloatTensor(audio_array)  # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
-audio_array24 = load_audio(filename,24000)  # load_audio already normalizes to -1..1, no further /32768 needed ###### resampling here could be GPU-accelerated
-audio24=torch.FloatTensor(audio_array24)  # /32768
+audio_array24 = load_audio(
+filename, 24000
+)  # load_audio already normalizes to -1..1, no further /32768 needed ###### resampling here could be GPU-accelerated
+audio24 = torch.FloatTensor(audio_array24)  # /32768
audio_norm24 = audio24
audio_norm24 = audio_norm24.unsqueeze(0)
-spec = spectrogram_torch(audio_norm, self.filter_length,
-self.sampling_rate, self.hop_length, self.win_length,
-center=False)
+spec = spectrogram_torch(
+audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
+)
spec = torch.squeeze(spec, 0)
-spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
-mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
+spec1 = spectrogram_torch(
+audio_norm24,
+self.filter_length_mel,
+self.sampling_rate_mel,
+self.hop_length_mel,
+self.win_length_mel,
+center=False,
+)
+mel = spec_to_mel_torch(
+spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
+)
mel = torch.squeeze(mel, 0)
-mel=self.norm_spec(mel)
+mel = self.norm_spec(mel)
# print(1111111,spec.shape,mel.shape)
return spec, mel
@@ -379,9 +393,10 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_sid_text)
-class TextAudioSpeakerCollateV3():
-""" Zero-pads model inputs and targets
-"""
+class TextAudioSpeakerCollateV3:
+"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -392,12 +407,10 @@ class TextAudioSpeakerCollateV3():
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
-#ssl, spec, wav,mel, text
+# ssl, spec, wav,mel, text
# Right zero-pad all one-hot text sequences to max input length
-_, ids_sorted_decreasing = torch.sort(
-torch.LongTensor([x[1].size(1) for x in batch]),
-dim=0, descending=True)
-#(ssl, spec,mel, text)
+_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
+# (ssl, spec,mel, text)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
@@ -411,7 +424,7 @@ class TextAudioSpeakerCollateV3():
# max_wav_len = max([x[2].size(1) for x in batch])
max_text_len = max([x[3].size(0) for x in batch])
-max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
+max_mel_len = int(max_ssl_len1 * 1.25 * 1.5)  ###24000/256,32000/640=16000/320
ssl_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
@@ -422,7 +435,7 @@ class TextAudioSpeakerCollateV3():
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
-text_padded = torch.LongTensor(len(batch), max_text_len)
+text_padded = torch.LongTensor(len(batch), max_text_len)
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
spec_padded.zero_()
@@ -435,11 +448,11 @@ class TextAudioSpeakerCollateV3():
row = batch[ids_sorted_decreasing[i]]
# ssl, spec, wav,mel, text
ssl = row[0]
-ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
+ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
-spec_padded[i, :, :spec.size(1)] = spec
+spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
# wav = row[2]
@@ -447,15 +460,17 @@ class TextAudioSpeakerCollateV3():
# wav_lengths[i] = wav.size(1)
mel = row[2]
-mel_padded[i, :, :mel.size(1)] = mel
+mel_padded[i, :, : mel.size(1)] = mel
mel_lengths[i] = mel.size(1)
text = row[3]
-text_padded[i, :text.size(0)] = text
+text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
-return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
+return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
@@ -479,7 +494,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
-if (len(tmp) != 4):
+if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -487,7 +502,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
-if (leng < min_num):
+if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -512,7 +527,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
-phoneme = phoneme.split(' ')
+phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -539,15 +554,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
assert len(audiopaths_sid_text_new) > 1  # need at least enough utterances to fill a batch; TODO here
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
-self.spec_min=-12
-self.spec_max=2
-self.filter_length_mel=self.win_length_mel=1024
-self.hop_length_mel=256
-self.n_mel_channels=100
-self.sampling_rate_mel=24000
-self.mel_fmin=0
-self.mel_fmax=None
+self.spec_min = -12
+self.spec_max = 2
+self.filter_length_mel = self.win_length_mel = 1024
+self.hop_length_mel = 256
+self.n_mel_channels = 100
+self.sampling_rate_mel = 24000
+self.mel_fmin = 0
+self.mel_fmax = None
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
@@ -555,10 +571,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
-spec, mel,wav = self.get_audio("%s/%s" % (self.path5, audiopath))
+spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
-if (ssl.shape[-1] != spec.shape[-1]):
+if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -573,27 +589,37 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
return (ssl, spec, wav, mel, text)
def get_audio(self, filename):
-audio_array = load_audio(filename,self.sampling_rate)  # load_audio already normalizes to -1..1, so no further /32768 is needed
-audio=torch.FloatTensor(audio_array)  # /32768
+audio_array = load_audio(filename, self.sampling_rate)  # load_audio already normalizes to -1..1, so no further /32768 is needed
+audio = torch.FloatTensor(audio_array)  # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
-audio_array24 = load_audio(filename,24000)  # load_audio already normalizes to -1..1, no further /32768 needed ###### resampling here could be GPU-accelerated
-audio24=torch.FloatTensor(audio_array24)  # /32768
+audio_array24 = load_audio(
+filename, 24000
+)  # load_audio already normalizes to -1..1, no further /32768 needed ###### resampling here could be GPU-accelerated
+audio24 = torch.FloatTensor(audio_array24)  # /32768
audio_norm24 = audio24
audio_norm24 = audio_norm24.unsqueeze(0)
-spec = spectrogram_torch(audio_norm, self.filter_length,
-self.sampling_rate, self.hop_length, self.win_length,
-center=False)
+spec = spectrogram_torch(
+audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
+)
spec = torch.squeeze(spec, 0)
-spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
-mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
+spec1 = spectrogram_torch(
+audio_norm24,
+self.filter_length_mel,
+self.sampling_rate_mel,
+self.hop_length_mel,
+self.win_length_mel,
+center=False,
+)
+mel = spec_to_mel_torch(
+spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
+)
mel = torch.squeeze(mel, 0)
-mel=self.norm_spec(mel)
+mel = self.norm_spec(mel)
# print(1111111,spec.shape,mel.shape)
-return spec, mel,audio_norm
+return spec, mel, audio_norm
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
@@ -605,9 +631,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_sid_text)
-class TextAudioSpeakerCollateV3b():
-""" Zero-pads model inputs and targets
-"""
+class TextAudioSpeakerCollateV3b:
+"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -618,12 +645,10 @@ class TextAudioSpeakerCollateV3b():
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
-#ssl, spec, wav,mel, text
+# ssl, spec, wav,mel, text
# Right zero-pad all one-hot text sequences to max input length
-_, ids_sorted_decreasing = torch.sort(
-torch.LongTensor([x[1].size(1) for x in batch]),
-dim=0, descending=True)
-#(ssl, spec,mel, text)
+_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
+# (ssl, spec,mel, text)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
@@ -636,7 +661,7 @@ class TextAudioSpeakerCollateV3b():
max_spec_len = int(2 * ((max_spec_len // 2) + 1))
max_wav_len = max([x[2].size(1) for x in batch])
max_text_len = max([x[4].size(0) for x in batch])
-max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
+max_mel_len = int(max_ssl_len1 * 1.25 * 1.5)  ###24000/256,32000/640=16000/320
ssl_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
@@ -647,7 +672,7 @@ class TextAudioSpeakerCollateV3b():
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
-text_padded = torch.LongTensor(len(batch), max_text_len)
+text_padded = torch.LongTensor(len(batch), max_text_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
spec_padded.zero_()
@@ -660,28 +685,40 @@ class TextAudioSpeakerCollateV3b():
row = batch[ids_sorted_decreasing[i]]
# ssl, spec, wav,mel, text
ssl = row[0]
-ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
+ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
-spec_padded[i, :, :spec.size(1)] = spec
+spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
-wav_padded[i, :, :wav.size(1)] = wav
+wav_padded[i, :, : wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
mel = row[3]
-mel_padded[i, :, :mel.size(1)] = mel
+mel_padded[i, :, : mel.size(1)] = mel
mel_lengths[i] = mel.size(1)
text = row[4]
-text_padded[i, :text.size(0)] = text
+text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
-return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
+return (
+ssl_padded,
+spec_padded,
+mel_padded,
+ssl_lengths,
+spec_lengths,
+text_padded,
+text_lengths,
+wav_padded,
+wav_lengths,
+mel_lengths,
+)
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
@@ -745,12 +782,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
num_samples_bucket = self.num_samples_per_bucket[i]
rem = num_samples_bucket - len_bucket
-ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
+ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)]
-ids_bucket = ids_bucket[self.rank::self.num_replicas]
+ids_bucket = ids_bucket[self.rank :: self.num_replicas]
for j in range(len(ids_bucket) // self.batch_size):
-batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
+batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]]
batches.append(batch)
if self.shuffle:
@@ -777,4 +814,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
return -1
def __len__(self):
-return self.num_samples // self.batch_size
+return self.num_samples // self.batch_size