Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the link for G2PW Model

* Update PyTorch version and Colab
Author: XXXXRT666
Date: 2025-04-07 09:42:47 +01:00
Committed by: GitHub
Parent: 9da7e17efe
Commit: 53cac93589
132 changed files with 8185 additions and 6648 deletions
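
The `ruff check --fix` and `ruff format --line-length 120 --target-version py39` commands in the commit message above are one-off CLI invocations. If the project wanted to make those settings the default, they could be pinned in `pyproject.toml`. The following is a minimal sketch of such a configuration; it is not part of this commit and assumes the repository does not already declare Ruff settings:

```toml
# Hypothetical Ruff configuration mirroring the CLI flags used in this commit.
# Shown for reference only; not included in the diff below.
[tool.ruff]
line-length = 120          # matches --line-length 120
target-version = "py39"    # matches --target-version py39
```

With settings like these in place, plain `ruff check --fix` and `ruff format` would pick up the same line length and target version without repeating the flags.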


@@ -7,6 +7,7 @@ from module import commons
from typing import Optional
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
@@ -43,7 +44,7 @@ class Encoder(nn.Module):
p_dropout=0.0,
window_size=4,
isflow=True,
- **kwargs
+ **kwargs,
):
super().__init__()
self.hidden_channels = hidden_channels
@@ -65,13 +66,9 @@ class Encoder(nn.Module):
if self.gin_channels != 0:
self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
# vits2 says 3rd block, so idx is 2 by default
- self.cond_layer_idx = (
- kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
- )
+ self.cond_layer_idx = kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
logging.debug(self.gin_channels, self.cond_layer_idx)
- assert (
- self.cond_layer_idx < self.n_layers
- ), "cond_layer_idx should be less than n_layers"
+ assert self.cond_layer_idx < self.n_layers, "cond_layer_idx should be less than n_layers"
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
@@ -117,11 +114,13 @@ class Encoder(nn.Module):
# x = self.norm_layers_2[i](x + y)
# x = x * x_mask
# return x
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
- for attn_layers,norm_layers_1,ffn_layers,norm_layers_2 in zip(self.attn_layers,self.norm_layers_1,self.ffn_layers,self.norm_layers_2):
+ for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zip(
+ self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
+ ):
y = attn_layers(x, x, attn_mask)
y = self.drop(y)
x = norm_layers_1(x + y)
@@ -170,14 +169,8 @@ class MultiHeadAttention(nn.Module):
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
- self.emb_rel_k = nn.Parameter(
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
- * rel_stddev
- )
- self.emb_rel_v = nn.Parameter(
- torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
- * rel_stddev
- )
+ self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
+ self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
@@ -187,7 +180,7 @@ class MultiHeadAttention(nn.Module):
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
- def forward(self, x, c, attn_mask:Optional[torch.Tensor]=None):
+ def forward(self, x, c, attn_mask: Optional[torch.Tensor] = None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
@@ -198,7 +191,7 @@ class MultiHeadAttention(nn.Module):
x = self.conv_o(x)
return x
- def attention(self, query, key, value, mask:Optional[torch.Tensor]=None):
+ def attention(self, query, key, value, mask: Optional[torch.Tensor] = None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, _ = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, -1).transpose(2, 3)
@@ -223,8 +216,8 @@ class MultiHeadAttention(nn.Module):
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
- output = (output.transpose(2, 3).contiguous().view(b, d, -1))
+ output = output.transpose(2, 3).contiguous().view(b, d, -1)
return output, p_attn
def _matmul_with_relative_values(self, x, y):
@@ -248,19 +241,17 @@ class MultiHeadAttention(nn.Module):
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
- pad_l = torch.zeros((1), dtype = torch.int64) + length - (self.window_size + 1)
- pad_s = torch.zeros((1), dtype = torch.int64) + (self.window_size + 1) - length
- pad_length = torch.max(pad_l, other=torch.zeros((1), dtype = torch.int64))
- slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype = torch.int64))
+ pad_l = torch.zeros((1), dtype=torch.int64) + length - (self.window_size + 1)
+ pad_s = torch.zeros((1), dtype=torch.int64) + (self.window_size + 1) - length
+ pad_length = torch.max(pad_l, other=torch.zeros((1), dtype=torch.int64))
+ slice_start_position = torch.max(pad_s, other=torch.zeros((1), dtype=torch.int64))
slice_end_position = slice_start_position + 2 * length - 1
padded_relative_embeddings = F.pad(
relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
)
- used_relative_embeddings = padded_relative_embeddings[
- :, slice_start_position:slice_end_position
- ]
+ used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
@@ -274,14 +265,10 @@ class MultiHeadAttention(nn.Module):
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
- x_flat = F.pad(
- x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
- )
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))
# Reshape and slice out the padded elements.
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
- :, :, :length, length - 1 :
- ]
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
return x_final
def _absolute_position_to_relative_position(self, x):
@@ -291,9 +278,7 @@ class MultiHeadAttention(nn.Module):
"""
batch, heads, length, _ = x.size()
# padd along column
- x = F.pad(
- x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
- )
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
@@ -351,7 +336,7 @@ class FFN(nn.Module):
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def padding(self, x):
return self._same_padding(x)
@@ -395,12 +380,6 @@ class MRTE(nn.Module):
ssl_enc = self.c_pre(ssl_enc * ssl_mask)
text_enc = self.text_pre(text * text_mask)
- x = (
- self.cross_attention(
- ssl_enc * ssl_mask, text_enc * text_mask, attn_mask
- )
- + ssl_enc
- + ge
- )
+ x = self.cross_attention(ssl_enc * ssl_mask, text_enc * text_mask, attn_mask) + ssl_enc + ge
x = self.c_post(x * ssl_mask)
return x