Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the link for G2PW Model

* Update PyTorch version and Colab
Author: XXXXRT666
Date: 2025-04-07 09:42:47 +01:00 (committed by GitHub)
parent 9da7e17efe
commit 53cac93589
132 changed files with 8185 additions and 6648 deletions
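
The two ruff invocations recorded in the commit message can also be pinned in the project's pyproject.toml so later runs reproduce the same style. A minimal sketch; the commit itself only records the CLI flags, so this configuration is an assumption, not part of the diff:

[tool.ruff]
line-length = 120
target-version = "py39"

With that in place, plain `ruff check --fix` and `ruff format` pick up the line length and target version automatically.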

View File

@@ -18,7 +18,7 @@ class Encoder(nn.Module):
p_dropout=0.0,
window_size=4,
isflow=False,
**kwargs
**kwargs,
):
super().__init__()
self.hidden_channels = hidden_channels
@@ -56,9 +56,7 @@ class Encoder(nn.Module):
)
self.norm_layers_2.append(LayerNorm(hidden_channels))
if isflow:
cond_layer = torch.nn.Conv1d(
kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1
)
cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1)
self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
self.cond_layer = weight_norm_modules(cond_layer, name="weight")
self.gin_channels = kwargs["gin_channels"]
@@ -74,9 +72,7 @@ class Encoder(nn.Module):
x = self.cond_pre(x)
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
x = commons.fused_add_tanh_sigmoid_multiply(
x, g_l, torch.IntTensor([self.hidden_channels])
)
x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels]))
y = self.attn_layers[i](x, x, attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
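
For context, commons.fused_add_tanh_sigmoid_multiply in the hunk above is the WaveNet-style gated activation used throughout VITS-family code: the 2 * hidden_channels conditioned features are split in half, one half passed through tanh and the other through sigmoid, then multiplied:

$$\text{out} = \tanh\big((x + g_l)_{[:h]}\big)\,\odot\,\sigma\big((x + g_l)_{[h:]}\big),\qquad h = \text{hidden\_channels}.$$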
@@ -99,7 +95,7 @@ class Decoder(nn.Module):
p_dropout=0.0,
proximal_bias=False,
proximal_init=True,
**kwargs
**kwargs,
):
super().__init__()
self.hidden_channels = hidden_channels
@@ -131,9 +127,7 @@ class Decoder(nn.Module):
)
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(
MultiHeadAttention(
hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
)
MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
)
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(
@@ -153,9 +147,7 @@ class Decoder(nn.Module):
x: decoder input
h: encoder output
"""
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
device=x.device, dtype=x.dtype
)
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
@@ -211,14 +203,8 @@ class MultiHeadAttention(nn.Module):
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_v = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
@@ -247,46 +233,28 @@ class MultiHeadAttention(nn.Module):
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert (
t_s == t_t
), "Relative attention is only available for self-attention."
assert t_s == t_t, "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(
query / math.sqrt(self.k_channels), key_relative_embeddings
)
rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(
device=scores.device, dtype=scores.dtype
)
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert (
t_s == t_t
), "Local attention is only available for self-attention."
block_mask = (
torch.ones_like(scores)
.triu(-self.block_length)
.tril(self.block_length)
)
assert t_s == t_t, "Local attention is only available for self-attention."
block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(
self.emb_rel_v, t_s
)
output = output + self._matmul_with_relative_values(
relative_weights, value_relative_embeddings
)
output = (
output.transpose(2, 3).contiguous().view(b, d, t_t)
) # [b, n_h, t_t, d_k] -> [b, d, t_t]
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
@@ -320,9 +288,7 @@ class MultiHeadAttention(nn.Module):
)
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[
:, slice_start_position:slice_end_position
]
used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
@@ -336,14 +302,10 @@ class MultiHeadAttention(nn.Module):
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
)
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
:, :, :length, length - 1 :
]
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
return x_final
def _absolute_position_to_relative_position(self, x):
@@ -353,9 +315,7 @@ class MultiHeadAttention(nn.Module):
"""
batch, heads, length, _ = x.size()
# pad along column
x = F.pad(
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
)
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
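
As an aside, the pad/reshape/slice trick used by these two position-conversion helpers is easiest to see on a concrete tensor. A standalone sketch, not part of the diff (length 3 chosen arbitrarily):

import torch
import torch.nn.functional as F

b, h, length = 1, 1, 3
# rel[0, 0, i, k] holds the score for query i at relative offset k - (length - 1)
rel = torch.arange(length * (2 * length - 1), dtype=torch.float).view(b, h, length, 2 * length - 1)
x = F.pad(rel, (0, 1))                      # one pad column per row: [b, h, l, 2l]
x_flat = x.view(b, h, length * 2 * length)  # rows interlock once flattened
x_flat = F.pad(x_flat, (0, length - 1))     # pad so the reshape below is valid
absolute = x_flat.view(b, h, length + 1, 2 * length - 1)[:, :, :length, length - 1:]
print(absolute[0, 0])
# tensor([[ 2.,  3.,  4.],
#         [ 6.,  7.,  8.],
#         [10., 11., 12.]])  -> entry (i, j) is rel[..., i, length - 1 + j - i]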
@@ -537,9 +497,7 @@ class Depthwise_Separable_TransposeConv1D(nn.Module):
def weight_norm_modules(module, name="weight", dim=0):
if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(
module, Depthwise_Separable_TransposeConv1D
):
if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D):
module.weight_norm()
return module
else:
@@ -547,9 +505,7 @@ def weight_norm_modules(module, name="weight", dim=0):
def remove_weight_norm_modules(module, name="weight"):
if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(
module, Depthwise_Separable_TransposeConv1D
):
if isinstance(module, Depthwise_Separable_Conv1D) or isinstance(module, Depthwise_Separable_TransposeConv1D):
module.remove_weight_norm()
else:
remove_weight_norm(module, name)
@@ -567,7 +523,7 @@ class FFT(nn.Module):
proximal_bias=False,
proximal_init=True,
isflow=False,
**kwargs
**kwargs,
):
super().__init__()
self.hidden_channels = hidden_channels
@@ -579,9 +535,7 @@ class FFT(nn.Module):
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
if isflow:
cond_layer = torch.nn.Conv1d(
kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1
)
cond_layer = torch.nn.Conv1d(kwargs["gin_channels"], 2 * hidden_channels * n_layers, 1)
self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
self.cond_layer = weight_norm_modules(cond_layer, name="weight")
self.gin_channels = kwargs["gin_channels"]
@@ -622,18 +576,14 @@ class FFT(nn.Module):
if g is not None:
g = self.cond_layer(g)
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
device=x.device, dtype=x.dtype
)
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
x = x * x_mask
for i in range(self.n_layers):
if g is not None:
x = self.cond_pre(x)
cond_offset = i * 2 * self.hidden_channels
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
x = commons.fused_add_tanh_sigmoid_multiply(
x, g_l, torch.IntTensor([self.hidden_channels])
)
x = commons.fused_add_tanh_sigmoid_multiply(x, g_l, torch.IntTensor([self.hidden_channels]))
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)

View File

@@ -7,6 +7,7 @@ from module import commons
from typing import Optional
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
@@ -43,7 +44,7 @@ class Encoder(nn.Module):
p_dropout=0.0,
window_size=4,
isflow=True,
**kwargs
**kwargs,
):
super().__init__()
self.hidden_channels = hidden_channels
@@ -65,13 +66,9 @@ class Encoder(nn.Module):
if self.gin_channels != 0:
self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
# vits2 says 3rd block, so idx is 2 by default
self.cond_layer_idx = (
kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
)
self.cond_layer_idx = kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
logging.debug(self.gin_channels, self.cond_layer_idx)
assert (
self.cond_layer_idx < self.n_layers
), "cond_layer_idx should be less than n_layers"
assert self.cond_layer_idx < self.n_layers, "cond_layer_idx should be less than n_layers"
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
@@ -117,11 +114,13 @@ class Encoder(nn.Module):
# x = self.norm_layers_2[i](x + y)
# x = x * x_mask
# return x
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for attn_layers,norm_layers_1,ffn_layers,norm_layers_2 in zip(self.attn_layers,self.norm_layers_1,self.ffn_layers,self.norm_layers_2):
for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zip(
self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
):
y = attn_layers(x, x, attn_mask)
y = self.drop(y)
x = norm_layers_1(x + y)
@@ -170,14 +169,8 @@ class MultiHeadAttention(nn.Module):
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_v = nn.Parameter(
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
* rel_stddev
)
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
@@ -187,7 +180,7 @@ class MultiHeadAttention(nn.Module):
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask:Optional[torch.Tensor]=None):
def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
@@ -198,7 +191,7 @@ class MultiHeadAttention(nn.Module):
x = self.conv_o(x)
return x
def attention(self, query, key, value, mask:Optional[torch.Tensor]=None):
def attention(self, query, key, value, mask: Optional[torch.Tensor] = None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, _ = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
@@ -223,8 +216,8 @@ class MultiHeadAttention(nn.Module):
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
output = (output.transpose(2, 3).contiguous().view(b, d, -1))
output = output.transpose(2, 3).contiguous().view(b, d, -1)
return output, p_attn
def _matmul_with_relative_values(self, x, y):
@@ -248,19 +241,17 @@ class MultiHeadAttention(nn.Module):
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_l = torch.zeros((1), dtype = torch.int64) + length - (self.window_size + 1)
pad_s = torch.zeros((1), dtype = torch.int64) + (self.window_size + 1) - length
pad_length = torch.max(pad_l, other=torch.zeros((1), dtype = torch.int64))
slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype = torch.int64))
pad_l = torch.zeros((1), dtype=torch.int64) + length - (self.window_size + 1)
pad_s = torch.zeros((1), dtype=torch.int64) + (self.window_size + 1) - length
pad_length = torch.max(pad_l, other=torch.zeros((1), dtype=torch.int64))
slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype=torch.int64))
slice_end_position = slice_start_position + 2 * length - 1
padded_relative_embeddings = F.pad(
relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
)
used_relative_embeddings = padded_relative_embeddings[
:, slice_start_position:slice_end_position
]
used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
@@ -274,14 +265,10 @@ class MultiHeadAttention(nn.Module):
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
)
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
:, :, :length, length - 1 :
]
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
return x_final
def _absolute_position_to_relative_position(self, x):
@@ -291,9 +278,7 @@ class MultiHeadAttention(nn.Module):
"""
batch, heads, length, _ = x.size()
# pad along column
x = F.pad(
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
)
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
@@ -351,7 +336,7 @@ class FFN(nn.Module):
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def padding(self, x):
return self._same_padding(x)
@@ -395,12 +380,6 @@ class MRTE(nn.Module):
ssl_enc = self.c_pre(ssl_enc * ssl_mask)
text_enc = self.text_pre(text * text_mask)
x = (
self.cross_attention(
ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
)
+ ssl_enc
+ ge
)
x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge
x = self.c_post(x * ssl_mask)
return x

View File

@@ -28,9 +28,7 @@ def intersperse(lst, item):
def kl_divergence(m_p, logs_p, m_q, logs_q):
"""KL(P||Q)"""
kl = (logs_q - logs_p) - 0.5
kl += (
0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
)
kl += 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
return kl
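
Writing sigma_p = exp(logs_p) and sigma_q = exp(logs_q), the expression is the per-dimension KL between the diagonal Gaussians P = N(m_p, sigma_p^2) and Q = N(m_q, sigma_q^2):

$$\mathrm{KL}(P\,\|\,Q) = \log\frac{\sigma_q}{\sigma_p} - \frac{1}{2} + \frac{\sigma_p^2 + (m_p - m_q)^2}{2\sigma_q^2}.$$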
@@ -67,9 +65,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4):
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
position = torch.arange(length, dtype=torch.float)
num_timescales = channels // 2
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
num_timescales - 1
)
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (num_timescales - 1)
inv_timescales = min_timescale * torch.exp(
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
)
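
The reformatted expression keeps the geometric timescale spacing of Transformer sinusoidal embeddings: with N = channels // 2,

$$\text{inv\_timescale}_i = \text{min\_timescale}\cdot\exp\!\Big(\!-i\,\frac{\ln(\text{max\_timescale}/\text{min\_timescale})}{N-1}\Big),\qquad i = 0,\dots,N-1.$$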

View File

@@ -30,6 +30,7 @@
# SOFTWARE.
"""Core vector quantization implementation."""
import typing as tp
from einops import rearrange, repeat
@@ -121,9 +122,7 @@ class EuclideanCodebook(nn.Module):
):
super().__init__()
self.decay = decay
init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = (
uniform_init if not kmeans_init else torch.zeros
)
init_fn: tp.Union[tp.Callable[..., torch.Tensor], tp.Any] = uniform_init if not kmeans_init else torch.zeros
embed = init_fn(codebook_size, dim)
self.codebook_size = codebook_size
@@ -151,9 +150,7 @@ class EuclideanCodebook(nn.Module):
# broadcast_tensors(self.buffers())
def replace_(self, samples, mask):
modified_codebook = torch.where(
mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
)
modified_codebook = torch.where(mask[..., None], sample_vectors(samples, self.codebook_size), self.embed)
self.embed.data.copy_(modified_codebook)
def expire_codes_(self, batch_samples):
@@ -174,11 +171,7 @@ class EuclideanCodebook(nn.Module):
def quantize(self, x):
embed = self.embed.t()
dist = -(
x.pow(2).sum(1, keepdim=True)
- 2 * x @ embed
+ embed.pow(2).sum(0, keepdim=True)
)
dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed + embed.pow(2).sum(0, keepdim=True))
embed_ind = dist.max(dim=-1).indices
return embed_ind
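
The dist expression is the usual expansion of squared Euclidean distance, negated so the nearest codebook entry wins the argmax:

$$-\lVert x - e\rVert^2 = -\big(\lVert x\rVert^2 - 2\,x^\top e + \lVert e\rVert^2\big).$$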
@@ -222,8 +215,7 @@ class EuclideanCodebook(nn.Module):
embed_sum = x.t() @ embed_onehot
ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
cluster_size = (
laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon)
* self.cluster_size.sum()
laplace_smoothing(self.cluster_size, self.codebook_size, self.epsilon) * self.cluster_size.sum()
)
embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
self.embed.data.copy_(embed_normalized)
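
Assuming the usual encodec-style helper laplace_smoothing(x, K, eps) = (x + eps) / (sum_j x_j + K * eps), the trailing multiplication by cluster_size.sum() rescales the smoothed proportions back to counts,

$$\hat n_i = \frac{n_i + \epsilon}{\sum_j n_j + K\epsilon}\,\sum_j n_j,$$

which keeps every cluster size strictly positive before the division on the next line.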
@@ -264,12 +256,8 @@ class VectorQuantization(nn.Module):
_codebook_dim: int = default(codebook_dim, dim)
requires_projection = _codebook_dim != dim
self.project_in = (
nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
)
self.project_out = (
nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
)
self.project_in = nn.Linear(dim, _codebook_dim) if requires_projection else nn.Identity()
self.project_out = nn.Linear(_codebook_dim, dim) if requires_projection else nn.Identity()
self.epsilon = epsilon
self.commitment_weight = commitment_weight
@@ -330,13 +318,9 @@ class ResidualVectorQuantization(nn.Module):
def __init__(self, *, num_quantizers, **kwargs):
super().__init__()
self.layers = nn.ModuleList(
[VectorQuantization(**kwargs) for _ in range(num_quantizers)]
)
self.layers = nn.ModuleList([VectorQuantization(**kwargs) for _ in range(num_quantizers)])
def forward(
self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None
):
def forward(self, x, n_q: tp.Optional[int] = None, layers: tp.Optional[list] = None):
quantized_out = 0.0
residual = x
@@ -359,9 +343,7 @@ class ResidualVectorQuantization(nn.Module):
out_losses, out_indices = map(torch.stack, (all_losses, all_indices))
return quantized_out, out_indices, out_losses, out_quantized
def encode(
self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
) -> torch.Tensor:
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor:
residual = x
all_indices = []
n_q = n_q or len(self.layers)

View File

@@ -1,24 +1,18 @@
import time
import logging
import os
import random
import traceback
import numpy as np
import torch
import torch.utils.data
from tqdm import tqdm
from module import commons
from module.mel_processing import spectrogram_torch,spec_to_mel_torch
from module.mel_processing import spectrogram_torch, spec_to_mel_torch
from text import cleaned_text_to_sequence
from utils import load_wav_to_torch, load_filepaths_and_text
import torch.nn.functional as F
from functools import lru_cache
import requests
from scipy.io import wavfile
from io import BytesIO
from tools.my_utils import load_audio
version = os.environ.get('version',None)
version = os.environ.get("version", None)
# ZeroDivisionError fixed by Tybost (https://github.com/RVC-Boss/GPT-SoVITS/issues/79)
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
@@ -43,7 +37,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
if (len(tmp) != 4):
if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -51,7 +45,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
if (leng < min_num):
if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -76,7 +70,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
phoneme = phoneme.split(' ')
phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -111,7 +105,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
spec, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
if (ssl.shape[-1] != spec.shape[-1]):
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -129,8 +123,9 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
audio = torch.FloatTensor(audio_array) # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = spectrogram_torch(
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
)
spec = torch.squeeze(spec, 0)
return spec, audio_norm
@@ -146,12 +141,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
return len(self.audiopaths_sid_text)
def random_slice(self, ssl, wav, mel):
assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, (
"first", ssl.shape, wav.shape)
assert abs(ssl.shape[-1] - wav.shape[-1] // self.hop_length) < 3, ("first", ssl.shape, wav.shape)
len_mel = mel.shape[1]
if self.val:
reference_mel = mel[:, :len_mel // 3]
reference_mel = mel[:, : len_mel // 3]
return reference_mel, ssl, wav, mel
dir = random.randint(0, 1)
sep_point = random.randint(int(len_mel // 3), int(len_mel // 3 * 2))
@@ -159,20 +153,29 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
if dir == 0:
reference_mel = mel[:, :sep_point]
ssl = ssl[:, :, sep_point:]
wav2 = wav[:, sep_point * self.hop_length:]
wav2 = wav[:, sep_point * self.hop_length :]
mel = mel[:, sep_point:]
else:
reference_mel = mel[:, sep_point:]
ssl = ssl[:, :, :sep_point]
wav2 = wav[:, :sep_point * self.hop_length]
wav2 = wav[:, : sep_point * self.hop_length]
mel = mel[:, :sep_point]
assert abs(ssl.shape[-1] - wav2.shape[-1] // self.hop_length) < 3, (
ssl.shape, wav.shape, wav2.shape, mel.shape, sep_point, self.hop_length, sep_point * self.hop_length, dir)
ssl.shape,
wav.shape,
wav2.shape,
mel.shape,
sep_point,
self.hop_length,
sep_point * self.hop_length,
dir,
)
return reference_mel, ssl, wav2, mel
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
class TextAudioSpeakerCollate:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -184,9 +187,7 @@ class TextAudioSpeakerCollate():
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len = int(2 * ((max_ssl_len // 2) + 1))
@@ -214,22 +215,24 @@ class TextAudioSpeakerCollate():
row = batch[ids_sorted_decreasing[i]]
ssl = row[0]
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_padded[i, :, : wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
text = row[3]
text_padded[i, :text.size(0)] = text
text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
return ssl_padded, ssl_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, text_padded, text_lengths
class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
@@ -253,7 +256,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
if (len(tmp) != 4):
if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -261,7 +264,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
if (leng < min_num):
if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -286,7 +289,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
phoneme = phoneme.split(' ')
phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -313,15 +316,16 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
assert len(audiopaths_sid_text_new) > 1  # need at least enough to fill a batch; TODO here
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
self.spec_min=-12
self.spec_max=2
self.spec_min = -12
self.spec_max = 2
self.filter_length_mel = self.win_length_mel = 1024
self.hop_length_mel = 256
self.n_mel_channels = 100
self.sampling_rate_mel = 24000
self.mel_fmin = 0
self.mel_fmax = None
self.filter_length_mel=self.win_length_mel=1024
self.hop_length_mel=256
self.n_mel_channels=100
self.sampling_rate_mel=24000
self.mel_fmin=0
self.mel_fmax=None
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
@@ -332,7 +336,7 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
spec, mel = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
if (ssl.shape[-1] != spec.shape[-1]):
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -347,25 +351,35 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
return (ssl, spec, mel, text)
def get_audio(self, filename):
audio_array = load_audio(filename,self.sampling_rate)# load_audio already normalizes to -1~1, no need to /32768 again
audio=torch.FloatTensor(audio_array)#/32768
audio_array = load_audio(filename, self.sampling_rate)  # load_audio already normalizes to -1~1, no need to /32768 again
audio = torch.FloatTensor(audio_array)  # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
audio_array24 = load_audio(filename,24000)# load_audio already normalizes to -1~1, no need to /32768 again ###### resampling here could be GPU-accelerated
audio24=torch.FloatTensor(audio_array24)#/32768
audio_array24 = load_audio(
filename, 24000
)  # load_audio already normalizes to -1~1, no need to /32768 again; resampling here could be GPU-accelerated
audio24 = torch.FloatTensor(audio_array24) # /32768
audio_norm24 = audio24
audio_norm24 = audio_norm24.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = spectrogram_torch(
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
)
spec = torch.squeeze(spec, 0)
spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
spec1 = spectrogram_torch(
audio_norm24,
self.filter_length_mel,
self.sampling_rate_mel,
self.hop_length_mel,
self.win_length_mel,
center=False,
)
mel = spec_to_mel_torch(
spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
)
mel = torch.squeeze(mel, 0)
mel=self.norm_spec(mel)
mel = self.norm_spec(mel)
# print(1111111,spec.shape,mel.shape)
return spec, mel
@@ -379,9 +393,10 @@ class TextAudioSpeakerLoaderV3(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_sid_text)
class TextAudioSpeakerCollateV3():
""" Zero-pads model inputs and targets
"""
class TextAudioSpeakerCollateV3:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -392,12 +407,10 @@ class TextAudioSpeakerCollateV3():
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
#ssl, spec, wav,mel, text
# ssl, spec, wav,mel, text
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
#(ssl, spec,mel, text)
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
# (ssl, spec,mel, text)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
@@ -411,7 +424,7 @@ class TextAudioSpeakerCollateV3():
# max_wav_len = max([x[2].size(1) for x in batch])
max_text_len = max([x[3].size(0) for x in batch])
max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
ssl_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
@@ -422,7 +435,7 @@ class TextAudioSpeakerCollateV3():
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_mel_len)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
text_padded = torch.LongTensor(len(batch), max_text_len)
text_padded = torch.LongTensor(len(batch), max_text_len)
# wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
spec_padded.zero_()
@@ -435,11 +448,11 @@ class TextAudioSpeakerCollateV3():
row = batch[ids_sorted_decreasing[i]]
# ssl, spec, wav,mel, text
ssl = row[0]
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
# wav = row[2]
@@ -447,15 +460,17 @@ class TextAudioSpeakerCollateV3():
# wav_lengths[i] = wav.size(1)
mel = row[2]
mel_padded[i, :, :mel.size(1)] = mel
mel_padded[i, :, : mel.size(1)] = mel
mel_lengths[i] = mel.size(1)
text = row[3]
text_padded[i, :text.size(0)] = text
text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths
class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
@@ -479,7 +494,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
for line in lines:
tmp = line.split("\t")
if (len(tmp) != 4):
if len(tmp) != 4:
continue
self.phoneme_data[tmp[0]] = [tmp[1]]
@@ -487,7 +502,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
tmp = self.audiopaths_sid_text
leng = len(tmp)
min_num = 100
if (leng < min_num):
if leng < min_num:
self.audiopaths_sid_text = []
for _ in range(max(2, int(min_num / leng))):
self.audiopaths_sid_text += tmp
@@ -512,7 +527,7 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
for audiopath in tqdm(self.audiopaths_sid_text):
try:
phoneme = self.phoneme_data[audiopath][0]
phoneme = phoneme.split(' ')
phoneme = phoneme.split(" ")
phoneme_ids = cleaned_text_to_sequence(phoneme, version)
except Exception:
print(f"{audiopath} not in self.phoneme_data !")
@@ -539,15 +554,16 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
assert len(audiopaths_sid_text_new) > 1  # need at least enough to fill a batch; TODO here
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
self.spec_min=-12
self.spec_max=2
self.spec_min = -12
self.spec_max = 2
self.filter_length_mel = self.win_length_mel = 1024
self.hop_length_mel = 256
self.n_mel_channels = 100
self.sampling_rate_mel = 24000
self.mel_fmin = 0
self.mel_fmax = None
self.filter_length_mel=self.win_length_mel=1024
self.hop_length_mel=256
self.n_mel_channels=100
self.sampling_rate_mel=24000
self.mel_fmin=0
self.mel_fmax=None
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
@@ -555,10 +571,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
audiopath, phoneme_ids = audiopath_sid_text
text = torch.FloatTensor(phoneme_ids)
try:
spec, mel,wav = self.get_audio("%s/%s" % (self.path5, audiopath))
spec, mel, wav = self.get_audio("%s/%s" % (self.path5, audiopath))
with torch.no_grad():
ssl = torch.load("%s/%s.pt" % (self.path4, audiopath), map_location="cpu")
if (ssl.shape[-1] != spec.shape[-1]):
if ssl.shape[-1] != spec.shape[-1]:
typee = ssl.dtype
ssl = F.pad(ssl.float(), (0, 1), mode="replicate").to(typee)
ssl.requires_grad = False
@@ -573,27 +589,37 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
return (ssl, spec, wav, mel, text)
def get_audio(self, filename):
audio_array = load_audio(filename,self.sampling_rate)# load_audio already normalizes to -1~1, no need to /32768 again
audio=torch.FloatTensor(audio_array)#/32768
audio_array = load_audio(filename, self.sampling_rate)  # load_audio already normalizes to -1~1, no need to /32768 again
audio = torch.FloatTensor(audio_array)  # /32768
audio_norm = audio
audio_norm = audio_norm.unsqueeze(0)
audio_array24 = load_audio(filename,24000)# load_audio already normalizes to -1~1, no need to /32768 again ###### resampling here could be GPU-accelerated
audio24=torch.FloatTensor(audio_array24)#/32768
audio_array24 = load_audio(
filename, 24000
)  # load_audio already normalizes to -1~1, no need to /32768 again; resampling here could be GPU-accelerated
audio24 = torch.FloatTensor(audio_array24) # /32768
audio_norm24 = audio24
audio_norm24 = audio_norm24.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = spectrogram_torch(
audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
)
spec = torch.squeeze(spec, 0)
spec1 = spectrogram_torch(audio_norm24, self.filter_length_mel,self.sampling_rate_mel, self.hop_length_mel, self.win_length_mel,center=False)
mel = spec_to_mel_torch(spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax)
spec1 = spectrogram_torch(
audio_norm24,
self.filter_length_mel,
self.sampling_rate_mel,
self.hop_length_mel,
self.win_length_mel,
center=False,
)
mel = spec_to_mel_torch(
spec1, self.filter_length_mel, self.n_mel_channels, self.sampling_rate_mel, self.mel_fmin, self.mel_fmax
)
mel = torch.squeeze(mel, 0)
mel=self.norm_spec(mel)
mel = self.norm_spec(mel)
# print(1111111,spec.shape,mel.shape)
return spec, mel,audio_norm
return spec, mel, audio_norm
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
@@ -605,9 +631,10 @@ class TextAudioSpeakerLoaderV3b(torch.utils.data.Dataset):
def __len__(self):
return len(self.audiopaths_sid_text)
class TextAudioSpeakerCollateV3b():
""" Zero-pads model inputs and targets
"""
class TextAudioSpeakerCollateV3b:
"""Zero-pads model inputs and targets"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
@@ -618,12 +645,10 @@ class TextAudioSpeakerCollateV3b():
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
#ssl, spec, wav,mel, text
# ssl, spec, wav,mel, text
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
#(ssl, spec,mel, text)
_, ids_sorted_decreasing = torch.sort(torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True)
# (ssl, spec,mel, text)
max_ssl_len = max([x[0].size(2) for x in batch])
max_ssl_len1 = int(8 * ((max_ssl_len // 8) + 1))
@@ -636,7 +661,7 @@ class TextAudioSpeakerCollateV3b():
max_spec_len = int(2 * ((max_spec_len // 2) + 1))
max_wav_len = max([x[2].size(1) for x in batch])
max_text_len = max([x[4].size(0) for x in batch])
max_mel_len=int(max_ssl_len1*1.25*1.5)###24000/256,32000/640=16000/320
max_mel_len = int(max_ssl_len1 * 1.25 * 1.5) ###24000/256,32000/640=16000/320
ssl_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
@@ -647,7 +672,7 @@ class TextAudioSpeakerCollateV3b():
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
mel_padded = torch.FloatTensor(len(batch), batch[0][3].size(0), max_mel_len)
ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
text_padded = torch.LongTensor(len(batch), max_text_len)
text_padded = torch.LongTensor(len(batch), max_text_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
spec_padded.zero_()
@@ -660,28 +685,40 @@ class TextAudioSpeakerCollateV3b():
row = batch[ids_sorted_decreasing[i]]
# ssl, spec, wav,mel, text
ssl = row[0]
ssl_padded[i, :, :ssl.size(2)] = ssl[0, :, :]
ssl_padded[i, :, : ssl.size(2)] = ssl[0, :, :]
ssl_lengths[i] = ssl.size(2)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_padded[i, :, : spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_padded[i, :, : wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
mel = row[3]
mel_padded[i, :, :mel.size(1)] = mel
mel_padded[i, :, : mel.size(1)] = mel
mel_lengths[i] = mel.size(1)
text = row[4]
text_padded[i, :text.size(0)] = text
text_padded[i, : text.size(0)] = text
text_lengths[i] = text.size(0)
return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
return (
ssl_padded,
spec_padded,
mel_padded,
ssl_lengths,
spec_lengths,
text_padded,
text_lengths,
wav_padded,
wav_lengths,
mel_lengths,
)
# return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths,mel_lengths
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
@@ -745,12 +782,12 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
num_samples_bucket = self.num_samples_per_bucket[i]
rem = num_samples_bucket - len_bucket
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[: (rem % len_bucket)]
ids_bucket = ids_bucket[self.rank::self.num_replicas]
ids_bucket = ids_bucket[self.rank :: self.num_replicas]
for j in range(len(ids_bucket) // self.batch_size):
batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size : (j + 1) * self.batch_size]]
batches.append(batch)
if self.shuffle:
@@ -777,4 +814,4 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
return -1
def __len__(self):
return self.num_samples // self.batch_size
return self.num_samples // self.batch_size
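
The oversampling arithmetic in __iter__ above (rem // len_bucket whole repeats of the bucket plus a rem % len_bucket head slice) pads each bucket up to its per-epoch target. A toy illustration with assumed sizes:

bucket = [0, 1, 2]                      # len_bucket = 3
num_samples_bucket = 8                  # target size after padding
rem = num_samples_bucket - len(bucket)  # 5 extra ids needed
ids_bucket = bucket + bucket * (rem // len(bucket)) + bucket[: rem % len(bucket)]
print(ids_bucket)  # [0, 1, 2, 0, 1, 2, 0, 1]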

View File

@@ -1,7 +1,6 @@
import math
import torch
from torch.nn import functional as F
def feature_loss(fmap_r, fmap_g):
@@ -66,8 +65,6 @@ def mle_loss(z, m, logs, logdet, mask):
torch.exp(-2 * logs) * ((z - m) ** 2)
) # neg normal likelihood w/o the constant term
l = l - torch.sum(logdet) # log jacobian determinant
l = l / torch.sum(
torch.ones_like(z) * mask
) # averaging across batch, channel and time axes
l = l / torch.sum(torch.ones_like(z) * mask) # averaging across batch, channel and time axes
l = l + 0.5 * math.log(2 * math.pi) # add the remaining constant term
return l
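
mle_loss is the negative log-likelihood of z under N(m, e^{2 logs}), corrected by the flow's log-determinant and averaged over the masked batch, channel, and time axes:

$$\mathcal{L} = \frac{\sum\big(logs + \tfrac{1}{2}e^{-2\,logs}(z - m)^2\big) - \sum\log\lvert\det J\rvert}{\sum \text{mask}} + \tfrac{1}{2}\log(2\pi).$$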

View File

@@ -1,16 +1,5 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn
MAX_WAV_VALUE = 32768.0
@@ -58,9 +47,7 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)
dtype_device = str(y.dtype) + "_" + str(y.device)
wnsize_dtype_device = str(win_size) + "_" + dtype_device
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
dtype=y.dtype, device=y.device
)
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
y = torch.nn.functional.pad(
y.unsqueeze(1),
@@ -90,20 +77,14 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
dtype_device = str(spec.dtype) + "_" + str(spec.device)
fmax_dtype_device = str(fmax) + "_" + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
dtype=spec.dtype, device=spec.device
)
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
spec = spectral_normalize_torch(spec)
return spec
def mel_spectrogram_torch(
y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
):
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
if torch.min(y) < -1.0:
print("min value is ", torch.min(y))
if torch.max(y) > 1.0:
@@ -114,16 +95,10 @@ def mel_spectrogram_torch(
fmax_dtype_device = str(fmax) + "_" + dtype_device
wnsize_dtype_device = str(win_size) + "_" + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(
sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
dtype=y.dtype, device=y.device
)
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
dtype=y.dtype, device=y.device
)
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
y = torch.nn.functional.pad(
y.unsqueeze(1),

View File

@@ -1,9 +1,7 @@
import warnings
warnings.filterwarnings("ignore")
import copy
import math
import os
import pdb
import torch
from torch import nn
@@ -13,16 +11,18 @@ from module import commons
from module import modules
from module import attentions
from f5_tts.model import DiT
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from module.commons import init_weights, get_padding
from module.mrte_model import MRTE
from module.quantize import ResidualVectorQuantizer
# from text import symbols
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
from torch.cuda.amp import autocast
import contextlib,random
import contextlib
import random
class StochasticDurationPredictor(nn.Module):
@@ -48,29 +48,21 @@ class StochasticDurationPredictor(nn.Module):
self.flows = nn.ModuleList()
self.flows.append(modules.ElementwiseAffine(2))
for i in range(n_flows):
self.flows.append(
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
)
self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.flows.append(modules.Flip())
self.post_pre = nn.Conv1d(1, filter_channels, 1)
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.post_convs = modules.DDSConv(
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
)
self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
self.post_flows = nn.ModuleList()
self.post_flows.append(modules.ElementwiseAffine(2))
for i in range(4):
self.post_flows.append(
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
)
self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.post_flows.append(modules.Flip())
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.convs = modules.DDSConv(
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
)
self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
@@ -91,10 +83,7 @@ class StochasticDurationPredictor(nn.Module):
h_w = self.post_pre(w)
h_w = self.post_convs(h_w, x_mask)
h_w = self.post_proj(h_w) * x_mask
e_q = (
torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
* x_mask
)
e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
z_q = e_q
for flow in self.post_flows:
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
@@ -102,13 +91,8 @@ class StochasticDurationPredictor(nn.Module):
z_u, z1 = torch.split(z_q, [1, 1], 1)
u = torch.sigmoid(z_u) * x_mask
z0 = (w - u) * x_mask
logdet_tot_q += torch.sum(
(F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
)
logq = (
torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
- logdet_tot_q
)
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q
logdet_tot = 0
z0, logdet = self.log_flow(z0, x_mask)
@@ -117,18 +101,12 @@ class StochasticDurationPredictor(nn.Module):
for flow in flows:
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
logdet_tot = logdet_tot + logdet
nll = (
torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
- logdet_tot
)
nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot
return nll + logq # [b]
else:
flows = list(reversed(self.flows))
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
z = (
torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
* noise_scale
)
z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
for flow in flows:
z = flow(z, x_mask, g=x, reverse=reverse)
z0, z1 = torch.split(z, [1, 1], 1)
@@ -137,9 +115,7 @@ class StochasticDurationPredictor(nn.Module):
class DurationPredictor(nn.Module):
def __init__(
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
):
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
super().__init__()
self.in_channels = in_channels
@@ -149,13 +125,9 @@ class DurationPredictor(nn.Module):
self.gin_channels = gin_channels
self.drop = nn.Dropout(p_dropout)
self.conv_1 = nn.Conv1d(
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
)
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
self.norm_1 = modules.LayerNorm(filter_channels)
self.conv_2 = nn.Conv1d(
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
)
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
self.norm_2 = modules.LayerNorm(filter_channels)
self.proj = nn.Conv1d(filter_channels, 1, 1)
@@ -190,7 +162,7 @@ class TextEncoder(nn.Module):
kernel_size,
p_dropout,
latent_channels=192,
version = "v2",
version="v2",
):
super().__init__()
self.out_channels = out_channels
@@ -237,26 +209,22 @@ class TextEncoder(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, y, y_lengths, text, text_lengths, ge, speed=1,test=None):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
y.dtype
)
def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
y = self.ssl_proj(y * y_mask) * y_mask
y = self.encoder_ssl(y * y_mask, y_mask)
text_mask = torch.unsqueeze(
commons.sequence_mask(text_lengths, text.size(1)), 1
).to(y.dtype)
text_mask = torch.unsqueeze(commons.sequence_mask(text_lengths, text.size(1)), 1).to(y.dtype)
if test == 1:
text[:, :] = 0
text = self.text_embedding(text).transpose(1, 2)
text = self.encoder_text(text * text_mask, text_mask)
y = self.mrte(y, y_mask, text, text_mask, ge)
y = self.encoder2(y * y_mask, y_mask)
if(speed!=1):
y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear")
if speed != 1:
y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
stats = self.proj(y) * y_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
@@ -360,9 +328,7 @@ class PosteriorEncoder(nn.Module):
def forward(self, x, x_lengths, g=None):
if g != None:
g = g.detach()
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
@@ -372,14 +338,9 @@ class PosteriorEncoder(nn.Module):
class Encoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
def __init__(
self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
@@ -394,7 +355,7 @@ class Encoder(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, x, x_lengths, g=None):
if(g!=None):
if g != None:
g = g.detach()
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
@@ -402,6 +363,7 @@ class Encoder(nn.Module):
stats = self.proj(x) * x_mask
return stats, x_mask
class WNEncoder(nn.Module):
def __init__(
self,
@@ -434,9 +396,7 @@ class WNEncoder(nn.Module):
self.norm = modules.LayerNorm(out_channels)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
out = self.proj(x) * x_mask
@@ -459,9 +419,7 @@ class Generator(torch.nn.Module):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
@@ -481,9 +439,7 @@ class Generator(torch.nn.Module):
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
@@ -636,9 +592,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
periods = [2, 3, 5, 7, 11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
@@ -738,10 +692,7 @@ class Quantizer(torch.nn.Module):
super(Quantizer, self).__init__()
assert embed_dim % n_code_groups == 0
self.quantizer_modules = nn.ModuleList(
[
Quantizer_module(n_codes, embed_dim // n_code_groups)
for _ in range(n_code_groups)
]
[Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)]
)
self.n_code_groups = n_code_groups
self.embed_dim = embed_dim
@@ -759,9 +710,7 @@ class Quantizer(torch.nn.Module):
z_q.append(_z_q)
min_indicies.append(_min_indicies) # B * T,
z_q = torch.cat(z_q, -1).reshape(xin.shape)
loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean(
(z_q - xin.detach()) ** 2
)
loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2)
z_q = xin + (z_q - xin).detach()
z_q = z_q.transpose(1, 2)
codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups)
@@ -801,13 +750,9 @@ class CodePredictor(nn.Module):
self.p_dropout = p_dropout
self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1)
self.ref_enc = modules.MelStyleEncoder(
ssl_dim, style_vector_dim=hidden_channels
)
self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels)
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1)
self.n_q = n_q
@@ -820,9 +765,7 @@ class CodePredictor(nn.Module):
x = x + g
x = self.encoder(x * x_mask, x_mask)
x = self.out_proj(x * x_mask) * x_mask
logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(
2, 3
)
logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3)
target = codes[1:].transpose(0, 1)
if not infer:
logits = logits.reshape(-1, self.dims)
@@ -870,8 +813,8 @@ class SynthesizerTrn(nn.Module):
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
version = "v2",
**kwargs
version="v2",
**kwargs,
):
super().__init__()
self.spec_channels = spec_channels
@@ -902,7 +845,7 @@ class SynthesizerTrn(nn.Module):
n_layers,
kernel_size,
p_dropout,
version = version,
version=version,
)
self.dec = Generator(
inter_channels,
@@ -923,12 +866,10 @@ class SynthesizerTrn(nn.Module):
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
# self.version=os.environ.get("version","v1")
if(self.version=="v1"):
if self.version == "v1":
self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)
else:
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)
@@ -945,13 +886,11 @@ class SynthesizerTrn(nn.Module):
self.freeze_quantizer = freeze_quantizer
def forward(self, ssl, y, y_lengths, text, text_lengths):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
y.dtype
)
if(self.version=="v1"):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
if self.version == "v1":
ge = self.ref_enc(y * y_mask, y_mask)
else:
ge = self.ref_enc(y[:,:704] * y_mask, y_mask)
ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
with autocast(enabled=False):
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
with maybe_no_grad:
@@ -959,24 +898,16 @@ class SynthesizerTrn(nn.Module):
self.ssl_proj.eval()
self.quantizer.eval()
ssl = self.ssl_proj(ssl)
quantized, codes, commit_loss, quantized_list = self.quantizer(
ssl, layers=[0]
)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(
quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
)
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
x, m_p, logs_p, y_mask = self.enc_p(
quantized, y_lengths, text, text_lengths, ge
)
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
z_p = self.flow(z, y_mask, g=ge)
z_slice, ids_slice = commons.rand_slice_segments(
z, y_lengths, self.segment_size
)
z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
o = self.dec(z_slice, g=ge)
return (
o,
@@ -989,24 +920,18 @@ class SynthesizerTrn(nn.Module):
)
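`rand_slice_segments` crops a short random window of the latent so the waveform decoder and its discriminators only ever see `segment_size` frames per step; `ids_slice` is returned so the matching ground-truth audio window can be cut for the reconstruction losses. Roughly what it does (a sketch, not the repo's implementation):

import torch

def rand_slice(z, lengths, seg):
    # z: (B, C, T); crop one random window of `seg` frames per batch element
    max_start = (lengths - seg).clamp(min=0)
    ids = (torch.rand(z.size(0), device=z.device) * (max_start + 1).float()).long()
    out = torch.stack([z[i, :, int(s) : int(s) + seg] for i, s in enumerate(ids)])
    return out, ids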
def infer(self, ssl, y, y_lengths, text, text_lengths, test=None, noise_scale=0.5):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
y.dtype
)
if(self.version=="v1"):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
if self.version == "v1":
ge = self.ref_enc(y * y_mask, y_mask)
else:
ge = self.ref_enc(y[:,:704] * y_mask, y_mask)
ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
ssl = self.ssl_proj(ssl)
quantized, codes, commit_loss, _ = self.quantizer(ssl, layers=[0])
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(
quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
)
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
x, m_p, logs_p, y_mask = self.enc_p(
quantized, y_lengths, text, text_lengths, ge, test=test
)
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, test=test)
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=ge, reverse=True)
@@ -1015,39 +940,34 @@ class SynthesizerTrn(nn.Module):
return o, y_mask, (z, z_p, m_p, logs_p)
@torch.no_grad()
def decode(self, codes, text, refer, noise_scale=0.5,speed=1):
def decode(self, codes, text, refer, noise_scale=0.5, speed=1):
def get_ge(refer):
ge = None
if refer is not None:
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
refer_mask = torch.unsqueeze(
commons.sequence_mask(refer_lengths, refer.size(2)), 1
).to(refer.dtype)
if (self.version == "v1"):
refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
if self.version == "v1":
ge = self.ref_enc(refer * refer_mask, refer_mask)
else:
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
return ge
if(type(refer)==list):
ges=[]
if type(refer) == list:
ges = []
for _refer in refer:
ge=get_ge(_refer)
ge = get_ge(_refer)
ges.append(ge)
ge=torch.stack(ges,0).mean(0)
ge = torch.stack(ges, 0).mean(0)
else:
ge=get_ge(refer)
ge = get_ge(refer)
y_lengths = torch.LongTensor([codes.size(2) * 2]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(
quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
)
x, m_p, logs_p, y_mask = self.enc_p(
quantized, y_lengths, text, text_lengths, ge,speed
)
quantized = F.interpolate(quantized, size=int(quantized.shape[-1] * 2), mode="nearest")
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=ge, reverse=True)
@@ -1059,11 +979,10 @@ class SynthesizerTrn(nn.Module):
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0, 1)
class CFM(torch.nn.Module):
def __init__(
self,
in_channels,dit
):
def __init__(self, in_channels, dit):
super().__init__()
self.sigma_min = 1e-6
@@ -1077,41 +996,54 @@ class CFM(torch.nn.Module):
def inference(self, mu, x_lens, prompt, n_timesteps, temperature=1.0, inference_cfg_rate=0):
"""Forward diffusion"""
B, T = mu.size(0), mu.size(1)
x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype) * temperature
x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype) * temperature
prompt_len = prompt.size(-1)
prompt_x = torch.zeros_like(x,dtype=mu.dtype)
prompt_x = torch.zeros_like(x, dtype=mu.dtype)
prompt_x[..., :prompt_len] = prompt[..., :prompt_len]
x[..., :prompt_len] = 0
mu=mu.transpose(2,1)
mu = mu.transpose(2, 1)
t = 0
d = 1 / n_timesteps
for j in range(n_timesteps):
t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t
d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu, use_grad_ckpt=False,drop_audio_cond=False,drop_text=False).transpose(2, 1)
if inference_cfg_rate>1e-5:
neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1)
v_pred=v_pred+(v_pred-neg)*inference_cfg_rate
v_pred = self.estimator(
x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=False, drop_text=False
).transpose(2, 1)
if inference_cfg_rate > 1e-5:
neg = self.estimator(
x,
prompt_x,
x_lens,
t_tensor,
d_tensor,
mu,
use_grad_ckpt=False,
drop_audio_cond=True,
drop_text=True,
).transpose(2, 1)
v_pred = v_pred + (v_pred - neg) * inference_cfg_rate
x = x + d * v_pred
t = t + d
x[:, :, :prompt_len] = 0
return x
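`inference` is a fixed-step Euler integration of the learned velocity field from t = 0 to t = 1; when `inference_cfg_rate > 1e-5` a second, condition-dropped pass is run and the update is extrapolated as `v + (v - neg) * rate`, the standard classifier-free-guidance form. Stripped to its core (the `velocity` callable stands in for `self.estimator` and its simplified signature is an assumption):

import torch

@torch.no_grad()
def euler_cfg(velocity, x, mu, n_steps, cfg_rate=0.0):
    # x: (B, C, T) initial noise; mu: conditioning; velocity(...) -> dx/dt
    t, d = 0.0, 1.0 / n_steps
    for _ in range(n_steps):
        t_vec = torch.full((x.size(0),), t, device=x.device, dtype=x.dtype)
        v = velocity(x, t_vec, mu, drop_cond=False)
        if cfg_rate > 1e-5:
            v_neg = velocity(x, t_vec, mu, drop_cond=True)  # unconditional pass
            v = v + (v - v_neg) * cfg_rate                  # classifier-free guidance
        x = x + d * v                                       # Euler step along the flow
        t += d
    return x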
def forward(self, x1, x_lens, prompt_lens, mu, use_grad_ckpt):
b, _, t = x1.shape
t = torch.rand([b], device=mu.device, dtype=x1.dtype)
x0 = torch.randn_like(x1,device=mu.device)
x0 = torch.randn_like(x1, device=mu.device)
vt = x1 - x0
xt = x0 + t[:, None, None] * vt
dt = torch.zeros_like(t,device=mu.device)
dt = torch.zeros_like(t, device=mu.device)
prompt = torch.zeros_like(x1)
for i in range(b):
prompt[i, :, :prompt_lens[i]] = x1[i, :, :prompt_lens[i]]
xt[i, :, :prompt_lens[i]] = 0
gailv=0.3# if ttime()>1736250488 else 0.1
prompt[i, :, : prompt_lens[i]] = x1[i, :, : prompt_lens[i]]
xt[i, :, : prompt_lens[i]] = 0
gailv = 0.3  # "gailv" = probability of using the two-step midpoint target below; was: if ttime()>1736250488 else 0.1
if random.random() < gailv:
base = torch.randint(2, 8, (t.shape[0],), device=mu.device)
d = 1/torch.pow(2, base)
d = 1 / torch.pow(2, base)
d_input = d.clone()
d_input[d_input < 1e-2] = 0
# with torch.no_grad():
@@ -1119,52 +1051,55 @@ class CFM(torch.nn.Module):
# v_pred_1 = self.diffusion(xt, t, d_input, cond=conditioning).detach()
x_mid = xt + d[:, None, None] * v_pred_1
# v_pred_2 = self.diffusion(x_mid, t+d, d_input, cond=conditioning).detach()
v_pred_2 = self.estimator(x_mid, prompt, x_lens, t+d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach()
v_pred_2 = self.estimator(x_mid, prompt, x_lens, t + d, d_input, mu, use_grad_ckpt).transpose(2, 1).detach()
vt = (v_pred_1 + v_pred_2) / 2
vt = vt.detach()
dt = 2*d
dt = 2 * d
vt_pred = self.estimator(xt, prompt, x_lens, t,dt, mu, use_grad_ckpt).transpose(2,1)
vt_pred = self.estimator(xt, prompt, x_lens, t, dt, mu, use_grad_ckpt).transpose(2, 1)
loss = 0
for i in range(b):
loss += self.criterion(vt_pred[i, :, prompt_lens[i]:x_lens[i]], vt[i, :, prompt_lens[i]:x_lens[i]])
loss += self.criterion(vt_pred[i, :, prompt_lens[i] : x_lens[i]], vt[i, :, prompt_lens[i] : x_lens[i]])
loss /= b
return loss
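`forward` is conditional flow matching: sample `t ~ U(0, 1)`, place `xt = x0 + t * (x1 - x0)` on the straight path from noise to data, and regress the network onto the constant velocity `vt = x1 - x0`, with prompt frames masked out of the loss. The `gailv` branch instead averages two half-step predictions into a midpoint target over span `d`, which trains consistency across step sizes. The core loss without prompt masking (a sketch; `model` and its signature are illustrative):

import torch
import torch.nn.functional as F

def flow_matching_loss(model, x1, mu):
    # x1: (B, C, T) target features; mu: conditioning
    b = x1.size(0)
    t = torch.rand(b, device=x1.device, dtype=x1.dtype)   # one time per sample
    x0 = torch.randn_like(x1)                             # Gaussian source
    xt = x0 + t[:, None, None] * (x1 - x0)                # point on the straight path
    vt = x1 - x0                                          # constant target velocity
    v_pred = model(xt, t, mu)                             # predicted velocity field
    return F.mse_loss(v_pred, vt)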
def set_no_grad(net_g):
for name, param in net_g.named_parameters():
param.requires_grad=False
param.requires_grad = False
class SynthesizerTrnV3(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
version="v3",
**kwargs):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
version="v3",
**kwargs,
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -1185,132 +1120,133 @@ class SynthesizerTrnV3(nn.Module):
self.gin_channels = gin_channels
self.version = version
self.model_dim=512
self.model_dim = 512
self.use_sdp = use_sdp
self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout)
self.enc_p = TextEncoder(
inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
# self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback
# self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
# upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
# self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
# gin_channels=gin_channels)
# self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
ssl_dim = 768
assert semantic_frame_rate in ['25hz', "50hz"]
assert semantic_frame_rate in ["25hz", "50hz"]
self.semantic_frame_rate = semantic_frame_rate
if semantic_frame_rate == '25hz':
if semantic_frame_rate == "25hz":
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
else:
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
self.quantizer = ResidualVectorQuantizer(
dimension=ssl_dim,
n_q=1,
bins=1024
)
self.freeze_quantizer=freeze_quantizer
inter_channels2=512
self.bridge=nn.Sequential(
nn.Conv1d(inter_channels, inter_channels2, 1, stride=1),
nn.LeakyReLU()
)
self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels)
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1)
self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim
if self.freeze_quantizer==True:
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
self.freeze_quantizer = freeze_quantizer
inter_channels2 = 512
self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU())
self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels)
self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1)
self.cfm = CFM(
100,
DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),
) # text_dim is condition feature dim
if self.freeze_quantizer == True:
set_no_grad(self.ssl_proj)
set_no_grad(self.quantizer)
set_no_grad(self.enc_p)
def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths, use_grad_ckpt):#ssl_lengths no need now
def forward(
self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths, use_grad_ckpt
): # ssl_lengths no need now
with autocast(enabled=False):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
ge = self.ref_enc(y[:,:704] * y_mask, y_mask)
ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
with maybe_no_grad:
if self.freeze_quantizer:
self.ssl_proj.eval()#
self.ssl_proj.eval() #
self.quantizer.eval()
self.enc_p.eval()
ssl = self.ssl_proj(ssl)
quantized, codes, commit_loss, quantized_list = self.quantizer(
ssl, layers=[0]
)
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
fea, y_mask_ = self.wns1(fea, mel_lengths, ge)##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
B=ssl.shape[0]
prompt_len_max = mel_lengths*2/3
fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
fea, y_mask_ = self.wns1(
fea, mel_lengths, ge
) ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
B = ssl.shape[0]
prompt_len_max = mel_lengths * 2 / 3
prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)
minn=min(mel.shape[-1],fea.shape[-1])
mel=mel[:,:,:minn]
fea=fea[:,:,:minn]
cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt)
minn = min(mel.shape[-1], fea.shape[-1])
mel = mel[:, :, :minn]
fea = fea[:, :, :minn]
cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea, use_grad_ckpt)
return cfm_loss
@torch.no_grad()
def decode_encp(self, codes,text, refer,ge=None,speed=1):
def decode_encp(self, codes, text, refer, ge=None, speed=1):
# print(2333333,refer.shape)
# ge=None
if(ge==None):
if ge == None:
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask)
y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device)
if speed==1:
sizee=int(codes.size(2)*2.5*1.5)
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
if speed == 1:
sizee = int(codes.size(2) * 2.5 * 1.5)
else:
sizee=int(codes.size(2)*2.5*1.5/speed)+1
sizee = int(codes.size(2) * 2.5 * 1.5 / speed) + 1
y_lengths1 = torch.LongTensor([sizee]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == '25hz':
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge,speed)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
####more wn parameter to learn mel
fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
return fea,ge
return fea, ge
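The two nearest-neighbor interpolations chain the frame rates: 25 Hz codes ×2 → 50 Hz semantic features, then ×1.875 → 93.75 Hz, which matches a 24 kHz mel spectrogram with hop 256 (24000 / 256 = 93.75). This reading of the 1.875 factor is inferred from the numbers; the diff itself does not state it.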
def extract_latent(self, x):
ssl = self.ssl_proj(x)
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0,1)
return codes.transpose(0, 1)
class SynthesizerTrnV3b(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
**kwargs):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
**kwargs,
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -1330,47 +1266,52 @@ class SynthesizerTrnV3b(nn.Module):
self.n_speakers = n_speakers
self.gin_channels = gin_channels
self.model_dim=512
self.model_dim = 512
self.use_sdp = use_sdp
self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout)
self.enc_p = TextEncoder(
inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
# self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback
self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
gin_channels=gin_channels)
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback
self.dec = Generator(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,
)
self.enc_q = PosteriorEncoder(
spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels
)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
ssl_dim = 768
assert semantic_frame_rate in ['25hz', "50hz"]
assert semantic_frame_rate in ["25hz", "50hz"]
self.semantic_frame_rate = semantic_frame_rate
if semantic_frame_rate == '25hz':
if semantic_frame_rate == "25hz":
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
else:
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
self.quantizer = ResidualVectorQuantizer(
dimension=ssl_dim,
n_q=1,
bins=1024
)
self.freeze_quantizer=freeze_quantizer
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
self.freeze_quantizer = freeze_quantizer
inter_channels2=512
self.bridge=nn.Sequential(
nn.Conv1d(inter_channels, inter_channels2, 1, stride=1),
nn.LeakyReLU()
)
self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels)
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1)
self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim
inter_channels2 = 512
self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU())
self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels)
self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1)
self.cfm = CFM(
100,
DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),
) # text_dim is condition feature dim
def forward(self, ssl, y, mel,ssl_lengths,y_lengths, text, text_lengths,mel_lengths):#ssl_lengths no need now
def forward(self, ssl, y, mel, ssl_lengths, y_lengths, text, text_lengths, mel_lengths): # ssl_lengths no need now
with autocast(enabled=False):
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(y.dtype)
ge = self.ref_enc(y[:,:704] * y_mask, y_mask)
ge = self.ref_enc(y[:, :704] * y_mask, y_mask)
# ge = self.ref_enc(y * y_mask, y_mask)#change back, new spec setting is whole 24k
# ge=None
maybe_no_grad = torch.no_grad() if self.freeze_quantizer else contextlib.nullcontext()
@@ -1379,51 +1320,59 @@ class SynthesizerTrnV3b(nn.Module):
self.ssl_proj.eval()
self.quantizer.eval()
ssl = self.ssl_proj(ssl)
quantized, codes, commit_loss, quantized_list = self.quantizer(
ssl, layers=[0]
)
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl, layers=[0])
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=ge)
z_p = self.flow(z, y_mask, g=ge)
z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
o = self.dec(z_slice, g=ge)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
fea, y_mask_ = self.wns1(fea, mel_lengths, ge)
learned_mel = self.linear_mel(fea)
B=ssl.shape[0]
prompt_len_max = mel_lengths*2/3
prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long)#
minn=min(mel.shape[-1],fea.shape[-1])
mel=mel[:,:,:minn]
fea=fea[:,:,:minn]
cfm_loss= self.cfm(mel, mel_lengths, prompt_len, fea)#fea==cond,y_lengths==target_mel_lengths#ge not need
return commit_loss,cfm_loss,F.mse_loss(learned_mel, mel),o, ids_slice, y_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q), quantized
B = ssl.shape[0]
prompt_len_max = mel_lengths * 2 / 3
prompt_len = (torch.rand([B], device=fea.device) * prompt_len_max).floor().to(dtype=torch.long) #
minn = min(mel.shape[-1], fea.shape[-1])
mel = mel[:, :, :minn]
fea = fea[:, :, :minn]
cfm_loss = self.cfm(mel, mel_lengths, prompt_len, fea) # fea==cond,y_lengths==target_mel_lengths#ge not need
return (
commit_loss,
cfm_loss,
F.mse_loss(learned_mel, mel),
o,
ids_slice,
y_mask,
y_mask,
(z, z_p, m_p, logs_p, m_q, logs_q),
quantized,
)
@torch.no_grad()
def decode_encp(self, codes,text, refer,ge=None):
def decode_encp(self, codes, text, refer, ge=None):
# print(2333333,refer.shape)
# ge=None
if(ge==None):
if ge == None:
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask)
y_lengths = torch.LongTensor([int(codes.size(2)*2)]).to(codes.device)
y_lengths1 = torch.LongTensor([int(codes.size(2)*2.5*1.5)]).to(codes.device)
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
y_lengths1 = torch.LongTensor([int(codes.size(2) * 2.5 * 1.5)]).to(codes.device)
text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == '25hz':
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
####more wn parameter to learn mel
fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
return fea,ge
return fea, ge
def extract_latent(self, x):
ssl = self.ssl_proj(x)
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0,1)
return codes.transpose(0, 1)
View File
@@ -1,4 +1,3 @@
import copy
import math
from typing import Optional
import torch
@@ -11,14 +10,14 @@ from module import attentions_onnx as attentions
from f5_tts.model import DiT
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from module.commons import init_weights, get_padding
from module.quantize import ResidualVectorQuantizer
# from text import symbols
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
from torch.cuda.amp import autocast
class StochasticDurationPredictor(nn.Module):
@@ -44,29 +43,21 @@ class StochasticDurationPredictor(nn.Module):
self.flows = nn.ModuleList()
self.flows.append(modules.ElementwiseAffine(2))
for i in range(n_flows):
self.flows.append(
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
)
self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.flows.append(modules.Flip())
self.post_pre = nn.Conv1d(1, filter_channels, 1)
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.post_convs = modules.DDSConv(
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
)
self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
self.post_flows = nn.ModuleList()
self.post_flows.append(modules.ElementwiseAffine(2))
for i in range(4):
self.post_flows.append(
modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
)
self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
self.post_flows.append(modules.Flip())
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
self.convs = modules.DDSConv(
filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
)
self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
@@ -87,10 +78,7 @@ class StochasticDurationPredictor(nn.Module):
h_w = self.post_pre(w)
h_w = self.post_convs(h_w, x_mask)
h_w = self.post_proj(h_w) * x_mask
e_q = (
torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype)
* x_mask
)
e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
z_q = e_q
for flow in self.post_flows:
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
@@ -98,13 +86,8 @@ class StochasticDurationPredictor(nn.Module):
z_u, z1 = torch.split(z_q, [1, 1], 1)
u = torch.sigmoid(z_u) * x_mask
z0 = (w - u) * x_mask
logdet_tot_q += torch.sum(
(F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2]
)
logq = (
torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2])
- logdet_tot_q
)
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q
logdet_tot = 0
z0, logdet = self.log_flow(z0, x_mask)
@@ -113,18 +96,12 @@ class StochasticDurationPredictor(nn.Module):
for flow in flows:
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
logdet_tot = logdet_tot + logdet
nll = (
torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2])
- logdet_tot
)
nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask, [1, 2]) - logdet_tot
return nll + logq # [b]
else:
flows = list(reversed(self.flows))
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
z = (
torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
* noise_scale
)
z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
for flow in flows:
z = flow(z, x_mask, g=x, reverse=reverse)
z0, z1 = torch.split(z, [1, 1], 1)
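The reshaped lines above are the change-of-variables objective for the duration flow: log p(w) = log N(z; 0, I) + log|det ∂z/∂w|, so the NLL is the masked Gaussian energy minus the accumulated log-determinant. In isolation:

import math
import torch

def flow_nll(z, logdet_tot, x_mask):
    # NLL of z under N(0, I), corrected by the flow's summed log|det J|; shape [b]
    return torch.sum(0.5 * (math.log(2 * math.pi) + z**2) * x_mask, [1, 2]) - logdet_tot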
@@ -133,9 +110,7 @@ class StochasticDurationPredictor(nn.Module):
class DurationPredictor(nn.Module):
def __init__(
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
):
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
super().__init__()
self.in_channels = in_channels
@@ -145,13 +120,9 @@ class DurationPredictor(nn.Module):
self.gin_channels = gin_channels
self.drop = nn.Dropout(p_dropout)
self.conv_1 = nn.Conv1d(
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
)
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
self.norm_1 = modules.LayerNorm(filter_channels)
self.conv_2 = nn.Conv1d(
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
)
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
self.norm_2 = modules.LayerNorm(filter_channels)
self.proj = nn.Conv1d(filter_channels, 1, 1)
@@ -234,7 +205,7 @@ class TextEncoder(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, y, text, ge, speed=1):
y_mask = torch.ones_like(y[:1,:1,:])
y_mask = torch.ones_like(y[:1, :1, :])
y = self.ssl_proj(y * y_mask) * y_mask
y = self.encoder_ssl(y * y_mask, y_mask)
@@ -246,8 +217,8 @@ class TextEncoder(nn.Module):
y = self.mrte(y, y_mask, text, text_mask, ge)
y = self.encoder2(y * y_mask, y_mask)
if(speed!=1):
y = F.interpolate(y, size=int(y.shape[-1] / speed)+1, mode="linear")
if speed != 1:
y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
stats = self.proj(y) * y_mask
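The `speed` argument implements inference-time rate control by linearly resampling the encoder output along time (to about T / speed frames) and rebuilding the mask with nearest-neighbor interpolation; `speed > 1` shortens the utterance, `speed < 1` lengthens it. The same two calls in isolation:

import torch.nn.functional as F

def stretch_time(y, y_mask, speed):
    # y: (B, C, T); resample along T to change speaking rate at inference
    if speed != 1:
        y = F.interpolate(y, size=int(y.shape[-1] / speed) + 1, mode="linear")
        y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
    return y, y_mask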
@@ -333,9 +304,7 @@ class PosteriorEncoder(nn.Module):
def forward(self, x, x_lengths, g=None):
if g != None:
g = g.detach()
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
@@ -345,14 +314,9 @@ class PosteriorEncoder(nn.Module):
class Encoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
def __init__(
self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0
):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
@@ -367,7 +331,7 @@ class Encoder(nn.Module):
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
def forward(self, x, x_lengths, g=None):
if(g!=None):
if g != None:
g = g.detach()
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
@@ -375,6 +339,7 @@ class Encoder(nn.Module):
stats = self.proj(x) * x_mask
return stats, x_mask
class WNEncoder(nn.Module):
def __init__(
self,
@@ -407,9 +372,7 @@ class WNEncoder(nn.Module):
self.norm = modules.LayerNorm(out_channels)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
x.dtype
)
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
out = self.proj(x) * x_mask
@@ -432,9 +395,7 @@ class Generator(torch.nn.Module):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
self.ups = nn.ModuleList()
@@ -454,9 +415,7 @@ class Generator(torch.nn.Module):
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
@@ -465,7 +424,7 @@ class Generator(torch.nn.Module):
if gin_channels != 0:
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x, g:Optional[torch.Tensor]=None):
def forward(self, x, g: Optional[torch.Tensor] = None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
@@ -609,9 +568,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
periods = [2, 3, 5, 7, 11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
@@ -711,10 +668,7 @@ class Quantizer(torch.nn.Module):
super(Quantizer, self).__init__()
assert embed_dim % n_code_groups == 0
self.quantizer_modules = nn.ModuleList(
[
Quantizer_module(n_codes, embed_dim // n_code_groups)
for _ in range(n_code_groups)
]
[Quantizer_module(n_codes, embed_dim // n_code_groups) for _ in range(n_code_groups)]
)
self.n_code_groups = n_code_groups
self.embed_dim = embed_dim
@@ -732,9 +686,7 @@ class Quantizer(torch.nn.Module):
z_q.append(_z_q)
min_indicies.append(_min_indicies) # B * T,
z_q = torch.cat(z_q, -1).reshape(xin.shape)
loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean(
(z_q - xin.detach()) ** 2
)
loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2)
z_q = xin + (z_q - xin).detach()
z_q = z_q.transpose(1, 2)
codes = torch.stack(min_indicies, -1).reshape(B, T, self.n_code_groups)
@@ -774,13 +726,9 @@ class CodePredictor(nn.Module):
self.p_dropout = p_dropout
self.vq_proj = nn.Conv1d(ssl_dim, hidden_channels, 1)
self.ref_enc = modules.MelStyleEncoder(
ssl_dim, style_vector_dim=hidden_channels
)
self.ref_enc = modules.MelStyleEncoder(ssl_dim, style_vector_dim=hidden_channels)
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
self.out_proj = nn.Conv1d(hidden_channels, (n_q - 1) * dims, 1)
self.n_q = n_q
@@ -793,9 +741,7 @@ class CodePredictor(nn.Module):
x = x + g
x = self.encoder(x * x_mask, x_mask)
x = self.out_proj(x * x_mask) * x_mask
logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(
2, 3
)
logits = x.reshape(x.shape[0], self.n_q - 1, self.dims, x.shape[-1]).transpose(2, 3)
target = codes[1:].transpose(0, 1)
if not infer:
logits = logits.reshape(-1, self.dims)
@@ -844,7 +790,7 @@ class SynthesizerTrn(nn.Module):
semantic_frame_rate=None,
freeze_quantizer=None,
version="v2",
**kwargs
**kwargs,
):
super().__init__()
self.spec_channels = spec_channels
@@ -896,9 +842,7 @@ class SynthesizerTrn(nn.Module):
# 16,
# gin_channels=gin_channels,
# )
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels
)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
# self.version=os.environ.get("version","v1")
if self.version == "v1":
@@ -923,9 +867,9 @@ class SynthesizerTrn(nn.Module):
# self.enc_p.encoder_text.requires_grad_(False)
# self.enc_p.mrte.requires_grad_(False)
def forward(self, codes, text, refer,noise_scale=0.5, speed=1):
refer_mask = torch.ones_like(refer[:1,:1,:])
if (self.version == "v1"):
def forward(self, codes, text, refer, noise_scale=0.5, speed=1):
refer_mask = torch.ones_like(refer[:1, :1, :])
if self.version == "v1":
ge = self.ref_enc(refer * refer_mask, refer_mask)
else:
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
@@ -935,10 +879,8 @@ class SynthesizerTrn(nn.Module):
dquantized = torch.cat([quantized, quantized]).permute(1, 2, 0)
quantized = dquantized.contiguous().view(1, self.ssl_dim, -1)
x, m_p, logs_p, y_mask = self.enc_p(
quantized, text, ge, speed
)
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
z = self.flow(z_p, y_mask, g=ge, reverse=True)
@@ -951,11 +893,9 @@ class SynthesizerTrn(nn.Module):
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0, 1)
class CFM(torch.nn.Module):
def __init__(
self,
in_channels,dit
):
def __init__(self, in_channels, dit):
super().__init__()
# self.sigma_min = 1e-6
@@ -965,27 +905,34 @@ class CFM(torch.nn.Module):
# self.criterion = torch.nn.MSELoss()
def forward(self, mu:torch.Tensor, x_lens:torch.LongTensor, prompt:torch.Tensor, n_timesteps:torch.LongTensor, temperature:float=1.0):
def forward(
self,
mu: torch.Tensor,
x_lens: torch.LongTensor,
prompt: torch.Tensor,
n_timesteps: torch.LongTensor,
temperature: float = 1.0,
):
"""Forward diffusion"""
B, T = mu.size(0), mu.size(1)
x = torch.randn([B, self.in_channels, T], device=mu.device,dtype=mu.dtype)
x = torch.randn([B, self.in_channels, T], device=mu.device, dtype=mu.dtype)
ntimesteps = int(n_timesteps)
prompt_len = prompt.size(-1)
prompt_x = torch.zeros_like(x,dtype=mu.dtype)
prompt_x = torch.zeros_like(x, dtype=mu.dtype)
prompt_x[..., :prompt_len] = prompt[..., :prompt_len]
x[..., :prompt_len] = 0.0
mu=mu.transpose(2,1)
t = torch.tensor(0.0,dtype=x.dtype,device=x.device)
d = torch.tensor(1.0/ntimesteps,dtype=x.dtype,device=x.device)
d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d
mu = mu.transpose(2, 1)
t = torch.tensor(0.0, dtype=x.dtype, device=x.device)
d = torch.tensor(1.0 / ntimesteps, dtype=x.dtype, device=x.device)
d_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * d
for j in range(ntimesteps):
t_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * t
t_tensor = torch.ones(x.shape[0], device=x.device, dtype=mu.dtype) * t
# d_tensor = torch.ones(x.shape[0], device=x.device,dtype=mu.dtype) * d
# v_pred = model(x, t_tensor, d_tensor, **extra_args)
v_pred = self.estimator(x, prompt_x, x_lens, t_tensor,d_tensor, mu).transpose(2, 1)
v_pred = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu).transpose(2, 1)
# if inference_cfg_rate>1e-5:
# neg = self.estimator(x, prompt_x, x_lens, t_tensor, d_tensor, mu, use_grad_ckpt=False, drop_audio_cond=True, drop_text=True).transpose(2, 1)
# v_pred=v_pred+(v_pred-neg)*inference_cfg_rate
@@ -997,47 +944,51 @@ class CFM(torch.nn.Module):
def set_no_grad(net_g):
for name, param in net_g.named_parameters():
param.requires_grad=False
param.requires_grad = False
@torch.jit.script_if_tracing
def compile_codes_length(codes):
y_lengths1 = torch.LongTensor([codes.size(2)]).to(codes.device)
return y_lengths1 * 2.5 * 1.5
@torch.jit.script_if_tracing
def compile_ref_length(refer):
refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
return refer_lengths
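These helpers are wrapped in `@torch.jit.script_if_tracing` so that, during `torch.onnx.export` tracing, length arithmetic stays a graph op instead of being burned in as a constant (note that `y_lengths1 * 2.5 * 1.5` promotes the LongTensor result to float). A minimal sketch of the pattern (the function name is illustrative):

import torch

@torch.jit.script_if_tracing
def dynamic_len(x: torch.Tensor) -> torch.Tensor:
    # scripted only under tracing, so the exported graph keeps this length dynamic
    return torch.tensor([x.size(2)], dtype=torch.long, device=x.device)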
class SynthesizerTrnV3(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
version="v3",
**kwargs):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_speakers=0,
gin_channels=0,
use_sdp=True,
semantic_frame_rate=None,
freeze_quantizer=None,
version="v3",
**kwargs,
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
@@ -1058,41 +1009,38 @@ class SynthesizerTrnV3(nn.Module):
self.gin_channels = gin_channels
self.version = version
self.model_dim=512
self.model_dim = 512
self.use_sdp = use_sdp
self.enc_p = TextEncoder(inter_channels,hidden_channels,filter_channels,n_heads,n_layers,kernel_size,p_dropout)
self.enc_p = TextEncoder(
inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
# self.ref_enc = modules.MelStyleEncoder(spec_channels, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels)###Rollback
self.ref_enc = modules.MelStyleEncoder(704, style_vector_dim=gin_channels) ###Rollback
# self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
# upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
# self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
# gin_channels=gin_channels)
# self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
ssl_dim = 768
assert semantic_frame_rate in ['25hz', "50hz"]
assert semantic_frame_rate in ["25hz", "50hz"]
self.semantic_frame_rate = semantic_frame_rate
if semantic_frame_rate == '25hz':
if semantic_frame_rate == "25hz":
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 2, stride=2)
else:
self.ssl_proj = nn.Conv1d(ssl_dim, ssl_dim, 1, stride=1)
self.quantizer = ResidualVectorQuantizer(
dimension=ssl_dim,
n_q=1,
bins=1024
)
self.quantizer = ResidualVectorQuantizer(dimension=ssl_dim, n_q=1, bins=1024)
freeze_quantizer
inter_channels2=512
self.bridge=nn.Sequential(
nn.Conv1d(inter_channels, inter_channels2, 1, stride=1),
nn.LeakyReLU()
)
self.wns1=Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8,gin_channels=gin_channels)
self.linear_mel=nn.Conv1d(inter_channels2,100,1,stride=1)
self.cfm = CFM(100,DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),)#text_dim is condition feature dim
if freeze_quantizer==True:
inter_channels2 = 512
self.bridge = nn.Sequential(nn.Conv1d(inter_channels, inter_channels2, 1, stride=1), nn.LeakyReLU())
self.wns1 = Encoder(inter_channels2, inter_channels2, inter_channels2, 5, 1, 8, gin_channels=gin_channels)
self.linear_mel = nn.Conv1d(inter_channels2, 100, 1, stride=1)
self.cfm = CFM(
100,
DiT(**dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=inter_channels2, conv_layers=4)),
) # text_dim is condition feature dim
if freeze_quantizer == True:
set_no_grad(self.ssl_proj)
set_no_grad(self.quantizer)
set_no_grad(self.enc_p)
@@ -1100,24 +1048,23 @@ class SynthesizerTrnV3(nn.Module):
def create_ge(self, refer):
refer_lengths = compile_ref_length(refer)
refer_mask = torch.unsqueeze(commons.sequence_mask(refer_lengths, refer.size(2)), 1).to(refer.dtype)
ge = self.ref_enc(refer[:,:704] * refer_mask, refer_mask)
ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
return ge
def forward(self, codes, text,ge,speed=1):
def forward(self, codes, text, ge, speed=1):
y_lengths1 = compile_codes_length(codes)
y_lengths1=compile_codes_length(codes)
quantized = self.quantizer.decode(codes)
if self.semantic_frame_rate == '25hz':
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge,speed)
fea=self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest")##BCT
if self.semantic_frame_rate == "25hz":
quantized = F.interpolate(quantized, scale_factor=2, mode="nearest") ##BCT
x, m_p, logs_p, y_mask = self.enc_p(quantized, text, ge, speed)
fea = self.bridge(x)
fea = F.interpolate(fea, scale_factor=1.875, mode="nearest") ##BCT
####more wn parameter to learn mel
fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
return fea
def extract_latent(self, x):
ssl = self.ssl_proj(x)
ssl = self.ssl_proj(x)
quantized, codes, commit_loss, quantized_list = self.quantizer(ssl)
return codes.transpose(0,1)
return codes.transpose(0, 1)
View File
@@ -52,11 +52,7 @@ class ConvReluNorm(nn.Module):
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(
nn.Conv1d(
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
)
)
self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
for _ in range(n_layers - 1):
@@ -156,9 +152,7 @@ class WN(torch.nn.Module):
self.drop = nn.Dropout(p_dropout)
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(
gin_channels, 2 * hidden_channels * n_layers, 1
)
cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
for i in range(n_layers):
@@ -479,9 +473,7 @@ class ConvFlow(nn.Module):
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
self.proj = nn.Conv1d(
filter_channels, self.half_channels * (num_bins * 3 - 1), 1
)
self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
@@ -495,9 +487,7 @@ class ConvFlow(nn.Module):
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
self.filter_channels
)
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_derivatives = h[..., 2 * self.num_bins :]
x1, logabsdet = piecewise_rational_quadratic_transform(
@@ -616,9 +606,7 @@ class MultiHeadAttention(nn.Module):
self.w_ks = nn.Linear(d_model, n_head * d_k)
self.w_vs = nn.Linear(d_model, n_head * d_v)
self.attention = ScaledDotProductAttention(
temperature=np.power(d_model, 0.5), dropout=dropout
)
self.attention = ScaledDotProductAttention(temperature=np.power(d_model, 0.5), dropout=dropout)
self.fc = nn.Linear(n_head * d_v, d_model)
self.dropout = nn.Dropout(dropout)
@@ -649,9 +637,7 @@ class MultiHeadAttention(nn.Module):
output, attn = self.attention(q, k, v, mask=slf_mask)
output = output.view(n_head, sz_b, len_x, d_v)
output = (
output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1)
) # b x lq x (n*dv)
output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_x, -1) # b x lq x (n*dv)
output = self.fc(output)
@@ -741,9 +727,7 @@ class MelStyleEncoder(nn.Module):
if mask is not None:
mask = (mask.int() == 0).squeeze(1)
max_len = x.shape[1]
slf_attn_mask = (
mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None
)
slf_attn_mask = mask.unsqueeze(1).expand(-1, max_len, -1) if mask is not None else None
# spectral
x = self.spectral(x)
@@ -785,9 +769,7 @@ class MelStyleEncoderVAE(nn.Module):
mu = self.fc1(enc_out)
logvar = self.fc2(enc_out)
posterior = D.Normal(mu, torch.exp(logvar))
kl_divergence = D.kl_divergence(
posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar))
)
kl_divergence = D.kl_divergence(posterior, D.Normal(torch.zeros_like(mu), torch.ones_like(logvar)))
loss_kl = kl_divergence.mean()
z = posterior.rsample()
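This computes the KL between the diagonal Gaussian posterior and a standard normal via `torch.distributions`; note that `D.Normal`'s second argument is a standard deviation, so `torch.exp(logvar)` is passed as the scale here. Assuming `logvar` is a log-variance, the equivalent closed form is:

import torch

def kl_to_standard_normal(mu, logvar):
    # KL( N(mu, exp(logvar)) || N(0, I) ), averaged over elements
    return 0.5 * torch.mean(torch.exp(logvar) + mu**2 - 1.0 - logvar)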
@@ -825,9 +807,7 @@ class ActNorm(nn.Module):
def forward(self, x, x_mask=None, g=None, reverse=False, **kwargs):
if x_mask is None:
x_mask = torch.ones(x.size(0), 1, x.size(2)).to(
device=x.device, dtype=x.dtype
)
x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, dtype=x.dtype)
x_len = torch.sum(x_mask, [1, 2])
if not self.initialized:
self.initialize(x, x_mask)
@@ -856,9 +836,7 @@ class ActNorm(nn.Module):
v = m_sq - (m**2)
logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))
bias_init = (
(-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype)
)
bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to(dtype=self.bias.dtype)
logs_init = (-logs).view(*self.logs.shape).to(dtype=self.logs.dtype)
self.bias.data.copy_(bias_init)
@@ -873,9 +851,7 @@ class InvConvNear(nn.Module):
self.n_split = n_split
self.no_jacobian = no_jacobian
w_init = torch.linalg.qr(
torch.FloatTensor(self.n_split, self.n_split).normal_()
)[0]
w_init = torch.linalg.qr(torch.FloatTensor(self.n_split, self.n_split).normal_())[0]
if torch.det(w_init) < 0:
w_init[:, 0] = -1 * w_init[:, 0]
self.weight = nn.Parameter(w_init)
@@ -890,11 +866,7 @@ class InvConvNear(nn.Module):
x_len = torch.sum(x_mask, [1, 2])
x = x.view(b, 2, c // self.n_split, self.n_split // 2, t)
x = (
x.permute(0, 1, 3, 2, 4)
.contiguous()
.view(b, self.n_split, c // self.n_split, t)
)
x = x.permute(0, 1, 3, 2, 4).contiguous().view(b, self.n_split, c // self.n_split, t)
if reverse:
if hasattr(self, "weight_inv"):
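`InvConvNear` is a Glow-style invertible 1×1 convolution: QR of a Gaussian matrix yields a random orthogonal initialization, and the column sign flip forces det = +1 so the initial log-determinant contribution is zero. The initializer in isolation (a sketch of the pattern above):

import torch

def random_orthogonal(n):
    # QR of a Gaussian matrix gives a random orthogonal matrix
    w, _ = torch.linalg.qr(torch.randn(n, n))
    if torch.det(w) < 0:      # flip one column so det = +1, i.e. log|det| starts at 0
        w[:, 0] = -w[:, 0]
    return w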
View File
@@ -31,32 +31,15 @@ class MRTE(nn.Module):
text_enc = self.text_pre(text * text_mask)
if test != None:
if test == 0:
x = (
self.cross_attention(
ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
)
+ ssl_enc
+ ge
)
x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge
elif test == 1:
x = ssl_enc + ge
elif test == 2:
x = (
self.cross_attention(
ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask
)
+ ge
)
x = self.cross_attention(ssl_enc * 0 * ssl_mask, text_enc * text_mask, attn_mask) + ge
else:
raise ValueError("test should be 0,1,2")
else:
x = (
self.cross_attention(
ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
)
+ ssl_enc
+ ge
)
x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge
x = self.c_post(x * ssl_mask)
return x
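The MRTE block fuses SSL content features with phoneme text via cross-attention plus a global style vector `ge`; the `test` flag is an ablation switch: 0 or None is the full fusion, 1 drops the text attention entirely, 2 keeps the attention but zeroes the content query. As a small dispatch over the lines above (`attn` stands in for the cross-attention module):

def mrte_fuse(attn, ssl_enc, ssl_mask, text_enc, text_mask, ge, test=None):
    # test=1: style only; test=2: text attention with zeroed content; else: full fusion
    if test == 1:
        return ssl_enc + ge
    if test == 2:
        return attn(ssl_enc * 0 * ssl_mask, text_enc * text_mask) + ge
    return attn(ssl_enc * ssl_mask, text_enc * text_mask) + ssl_enc + ge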
@@ -70,9 +53,7 @@ class SpeakerEncoder(torch.nn.Module):
model_embedding_size=256,
):
super(SpeakerEncoder, self).__init__()
self.lstm = nn.LSTM(
mel_n_channels, model_hidden_size, model_num_layers, batch_first=True
)
self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
self.linear = nn.Linear(model_hidden_size, model_embedding_size)
self.relu = nn.ReLU()
View File
@@ -7,7 +7,6 @@
"""Residual vector quantizer implementation."""
from dataclasses import dataclass, field
import math
import typing as tp
import torch
@@ -88,14 +87,10 @@ class ResidualVectorQuantizer(nn.Module):
raise ValueError(
f"Last layer index in layers: A {max(layers)}. Number of quantizers in RVQ: B {self.n_q}. A must less than B."
)
quantized, codes, commit_loss, quantized_list = self.vq(
x, n_q=n_q, layers=layers
)
quantized, codes, commit_loss, quantized_list = self.vq(x, n_q=n_q, layers=layers)
return quantized, codes, torch.mean(commit_loss), quantized_list
def encode(
self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None
) -> torch.Tensor:
def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None, st: tp.Optional[int] = None) -> torch.Tensor:
"""Encode a given input tensor with the specified sample rate at the given bandwidth.
The RVQ encode method sets the appropriate number of quantizer to use
and returns indices for each quantizer.
View File
@@ -37,7 +37,7 @@ def piecewise_rational_quadratic_transform(
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
**spline_kwargs,
)
return outputs, logabsdet
@@ -175,8 +175,7 @@ def rational_quadratic_spline(
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
(input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
)
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * root.pow(2)
@@ -190,12 +189,9 @@ def rational_quadratic_spline(
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (
input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
)
numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta
(input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator
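These numerator/denominator expressions are the monotonic rational-quadratic spline of Durkan et al. (Neural Spline Flows). With \(\theta \in [0, 1]\) the normalized position inside bin \(k\), \(s_k\) the bin slope, and \(\delta_k\) the knot derivatives, the forward map reconstructed from the code is:

g(\theta) = y_k + \frac{(y_{k+1} - y_k)\,\bigl[s_k \theta^2 + \delta_k\,\theta(1-\theta)\bigr]}{s_k + \bigl(\delta_k + \delta_{k+1} - 2 s_k\bigr)\,\theta(1-\theta)},
\qquad s_k = \frac{y_{k+1} - y_k}{x_{k+1} - x_k}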