Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)
* ruff check --fix
* ruff format --line-length 120 --target-version py39
* Change the link for G2PW Model
* update pytorch version and colab
This commit is contained in:
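The commands in the commit message imply a 120-character line length and a Python 3.9 target. A minimal sketch of how those settings could be pinned in project configuration so later contributors reproduce the same formatting — assuming a pyproject.toml at the repository root with a [tool.ruff] section (the file name, location, and any existing Ruff configuration in this repository are assumptions, not shown in this commit):

    # Hypothetical pyproject.toml excerpt mirroring the CLI flags above
    [tool.ruff]
    line-length = 120
    target-version = "py39"

    # Reproduce the pass locally (same commands as the commit message):
    #   ruff check --fix
    #   ruff format

With such a config in place, ruff format picks up the line length and target version without the explicit command-line flags.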
@@ -1,17 +1,14 @@
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
from typing import Optional
from typing import Tuple
from typing import Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Linear
from torch.nn import Module
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from torch.nn.init import xavier_uniform_
from torch.nn import Linear, Module
from torch.nn import functional as F
from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
from torch.nn.parameter import Parameter

from torch.nn import functional as F
from AR.modules.patched_mha_with_cache import multi_head_attention_forward_patched

F.multi_head_attention_forward = multi_head_attention_forward_patched
@@ -73,6 +70,7 @@ class MultiheadAttention(Module):
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)

"""

__constants__ = ["batch_first"]
bias_k: Optional[torch.Tensor]
bias_v: Optional[torch.Tensor]
@@ -104,9 +102,7 @@ class MultiheadAttention(Module):
self.dropout = dropout
self.batch_first = batch_first
self.head_dim = embed_dim // num_heads
assert (
self.head_dim * num_heads == self.embed_dim
), "embed_dim must be divisible by num_heads"
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

if add_bias_kv:
self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
@@ -117,31 +113,32 @@ class MultiheadAttention(Module):
if linear1_cls == Linear:
if not self._qkv_same_embed_dim:
self.q_proj_weight = Parameter(
torch.empty((embed_dim, embed_dim), **factory_kwargs)
torch.empty((embed_dim, embed_dim), **factory_kwargs),
)
self.k_proj_weight = Parameter(
torch.empty((embed_dim, self.kdim), **factory_kwargs)
torch.empty((embed_dim, self.kdim), **factory_kwargs),
)
self.v_proj_weight = Parameter(
torch.empty((embed_dim, self.vdim), **factory_kwargs)
torch.empty((embed_dim, self.vdim), **factory_kwargs),
)
self.register_parameter("in_proj_weight", None)
else:
self.in_proj_weight = Parameter(
torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
torch.empty((3 * embed_dim, embed_dim), **factory_kwargs),
)
self.register_parameter("q_proj_weight", None)
self.register_parameter("k_proj_weight", None)
self.register_parameter("v_proj_weight", None)

if bias:
self.in_proj_bias = Parameter(
torch.empty(3 * embed_dim, **factory_kwargs)
)
self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs))
else:
self.register_parameter("in_proj_bias", None)
self.out_proj = NonDynamicallyQuantizableLinear(
embed_dim, embed_dim, bias=bias, **factory_kwargs
embed_dim,
embed_dim,
bias=bias,
**factory_kwargs,
)

self._reset_parameters()
@@ -150,7 +147,10 @@ class MultiheadAttention(Module):
raise NotImplementedError
else:
self.in_proj_linear = linear1_cls(
embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
embed_dim,
3 * embed_dim,
bias=bias,
**factory_kwargs,
)
self.in_proj_weight = self.in_proj_linear.weight

@@ -164,7 +164,10 @@ class MultiheadAttention(Module):
self.register_parameter("in_proj_bias", None)

self.out_proj = linear2_cls(
embed_dim, embed_dim, bias=bias, **factory_kwargs
embed_dim,
embed_dim,
bias=bias,
**factory_kwargs,
)

if self.bias_k is not None:
@@ -261,28 +264,26 @@ class MultiheadAttention(Module):
if key_padding_mask is not None:
_kpm_dtype = key_padding_mask.dtype
if _kpm_dtype != torch.bool and not torch.is_floating_point(
key_padding_mask
key_padding_mask,
):
raise AssertionError(
"only bool and floating types of key_padding_mask are supported"
)
raise AssertionError("only bool and floating types of key_padding_mask are supported")
why_not_fast_path = ""
if not is_batched:
why_not_fast_path = (
f"input not batched; expected query.dim() of 3 but got {query.dim()}"
)
why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
elif query is not key or key is not value:
# When lifting this restriction, don't forget to either
# enforce that the dtypes all match or test cases where
# they don't!
why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
elif (
self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype
):
why_not_fast_path = (
f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
)
elif self.in_proj_weight is not None and query.dtype != self.in_proj_weight.dtype:
# this case will fail anyway, but at least they'll get a useful error message.
why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
why_not_fast_path = (
f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
)
elif self.training:
why_not_fast_path = "training is enabled"
elif not self.batch_first:
@@ -300,9 +301,7 @@ class MultiheadAttention(Module):
elif attn_mask is not None:
why_not_fast_path = "attn_mask was not None"
elif query.is_nested and key_padding_mask is not None:
why_not_fast_path = (
"key_padding_mask is not supported with NestedTensor input"
)
why_not_fast_path = "key_padding_mask is not supported with NestedTensor input"
elif self.num_heads % 2 == 1:
why_not_fast_path = "num_heads is odd"
elif torch.is_autocast_enabled():
@@ -322,20 +321,10 @@ class MultiheadAttention(Module):
# generator expressions.
if torch.overrides.has_torch_function(tensor_args):
why_not_fast_path = "some Tensor argument has_torch_function"
elif not all(
[
(x is None or x.is_cuda or "cpu" in str(x.device))
for x in tensor_args
]
):
elif not all([(x is None or x.is_cuda or "cpu" in str(x.device)) for x in tensor_args]):
why_not_fast_path = "some Tensor argument is neither CUDA nor CPU"
elif torch.is_grad_enabled() and any(
[x is not None and x.requires_grad for x in tensor_args]
):
why_not_fast_path = (
"grad is enabled and at least one of query or the "
"input/output projection weights or biases requires_grad"
)
elif torch.is_grad_enabled() and any([x is not None and x.requires_grad for x in tensor_args]):
why_not_fast_path = "grad is enabled and at least one of query or the input/output projection weights or biases requires_grad"
if not why_not_fast_path:
return torch._native_multi_head_attention(
query,
@@ -350,11 +339,7 @@ class MultiheadAttention(Module):
key_padding_mask if key_padding_mask is not None else attn_mask,
need_weights,
average_attn_weights,
1
if key_padding_mask is not None
else 0
if attn_mask is not None
else None,
1 if key_padding_mask is not None else 0 if attn_mask is not None else None,
)

any_nested = query.is_nested or key.is_nested or value.is_nested

@@ -1,17 +1,13 @@
# modified from https://github.com/lifeiteng/vall-e/blob/main/valle/modules/activation.py
from typing import Optional
from typing import Tuple
from typing import Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Linear
from torch.nn import Module
from torch.nn.init import constant_
from torch.nn.init import xavier_normal_
from torch.nn.init import xavier_uniform_
from torch.nn import Linear, Module
from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
from torch.nn.parameter import Parameter

from torch.nn import functional as F
from AR.modules.patched_mha_with_cache_onnx import multi_head_attention_forward_patched


@@ -47,9 +43,7 @@ class MultiheadAttention(Module):
|
||||
self.dropout = dropout
|
||||
self.batch_first = batch_first
|
||||
self.head_dim = embed_dim // num_heads
|
||||
assert (
|
||||
self.head_dim * num_heads == self.embed_dim
|
||||
), "embed_dim must be divisible by num_heads"
|
||||
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
|
||||
|
||||
if add_bias_kv:
|
||||
self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs))
|
||||
@@ -60,18 +54,30 @@ class MultiheadAttention(Module):
|
||||
if linear1_cls == Linear:
|
||||
if not self._qkv_same_embed_dim:
|
||||
self.q_proj_weight = Parameter(
|
||||
torch.empty((embed_dim, embed_dim), **factory_kwargs)
|
||||
torch.empty(
|
||||
(embed_dim, embed_dim),
|
||||
**factory_kwargs,
|
||||
)
|
||||
)
|
||||
self.k_proj_weight = Parameter(
|
||||
torch.empty((embed_dim, self.kdim), **factory_kwargs)
|
||||
torch.empty(
|
||||
(embed_dim, self.kdim),
|
||||
**factory_kwargs,
|
||||
)
|
||||
)
|
||||
self.v_proj_weight = Parameter(
|
||||
torch.empty((embed_dim, self.vdim), **factory_kwargs)
|
||||
torch.empty(
|
||||
(embed_dim, self.vdim),
|
||||
**factory_kwargs,
|
||||
)
|
||||
)
|
||||
self.register_parameter("in_proj_weight", None)
|
||||
else:
|
||||
self.in_proj_weight = Parameter(
|
||||
torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
|
||||
torch.empty(
|
||||
(3 * embed_dim, embed_dim),
|
||||
**factory_kwargs,
|
||||
)
|
||||
)
|
||||
self.register_parameter("q_proj_weight", None)
|
||||
self.register_parameter("k_proj_weight", None)
|
||||
@@ -79,13 +85,11 @@ class MultiheadAttention(Module):
|
||||
|
||||
if bias:
|
||||
self.in_proj_bias = Parameter(
|
||||
torch.empty(3 * embed_dim, **factory_kwargs)
|
||||
torch.empty(3 * embed_dim, **factory_kwargs),
|
||||
)
|
||||
else:
|
||||
self.register_parameter("in_proj_bias", None)
|
||||
self.out_proj = NonDynamicallyQuantizableLinear(
|
||||
embed_dim, embed_dim, bias=bias, **factory_kwargs
|
||||
)
|
||||
self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs)
|
||||
|
||||
self._reset_parameters()
|
||||
else:
|
||||
@@ -93,7 +97,10 @@ class MultiheadAttention(Module):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
self.in_proj_linear = linear1_cls(
|
||||
embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
|
||||
embed_dim,
|
||||
3 * embed_dim,
|
||||
bias=bias,
|
||||
**factory_kwargs,
|
||||
)
|
||||
self.in_proj_weight = self.in_proj_linear.weight
|
||||
|
||||
@@ -107,7 +114,10 @@ class MultiheadAttention(Module):
|
||||
self.register_parameter("in_proj_bias", None)
|
||||
|
||||
self.out_proj = linear2_cls(
|
||||
embed_dim, embed_dim, bias=bias, **factory_kwargs
|
||||
embed_dim,
|
||||
embed_dim,
|
||||
bias=bias,
|
||||
**factory_kwargs,
|
||||
)
|
||||
|
||||
if self.bias_k is not None:
|
||||
|
||||
@@ -60,14 +60,11 @@ class SinePositionalEmbedding(nn.Module):
|
||||
return
|
||||
pe = torch.zeros(x.size(1), self.embedding_dim)
|
||||
if self.reverse:
|
||||
position = torch.arange(
|
||||
x.size(1) - 1, -1, -1.0, dtype=torch.float32
|
||||
).unsqueeze(1)
|
||||
position = torch.arange(x.size(1) - 1, -1, -1.0, dtype=torch.float32).unsqueeze(1)
|
||||
else:
|
||||
position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
|
||||
div_term = torch.exp(
|
||||
torch.arange(0, self.embedding_dim, 2, dtype=torch.float32)
|
||||
* -(math.log(10000.0) / self.embedding_dim)
|
||||
torch.arange(0, self.embedding_dim, 2, dtype=torch.float32) * -(math.log(10000.0) / self.embedding_dim)
|
||||
)
|
||||
pe[:, 0::2] = torch.sin(position * div_term)
|
||||
pe[:, 1::2] = torch.cos(position * div_term)
|
||||
|
||||
@@ -50,7 +50,7 @@ class SinePositionalEmbedding(nn.Module):
|
||||
self.div_term = torch.exp(torch.arange(0, self.embedding_dim, 2) * -(math.log(10000.0) / self.embedding_dim))
|
||||
|
||||
def extend_pe(self, x):
|
||||
position = torch.cumsum(torch.ones_like(x[:,:,0]), dim=1).transpose(0, 1)
|
||||
position = torch.cumsum(torch.ones_like(x[:, :, 0]), dim=1).transpose(0, 1)
|
||||
scpe = (position * self.div_term).unsqueeze(0)
|
||||
pe = torch.cat([torch.sin(scpe), torch.cos(scpe)]).permute(1, 2, 0)
|
||||
pe = pe.contiguous().view(1, -1, self.embedding_dim)
|
||||
|
||||
@@ -49,13 +49,9 @@ class WarmupCosineLRSchedule(torch.optim.lr_scheduler._LRScheduler):
|
||||
lr = self.end_lr
|
||||
|
||||
else:
|
||||
decay_ratio = (self._current_step - self.warmup_steps) / (
|
||||
self.total_steps - self.warmup_steps
|
||||
)
|
||||
decay_ratio = (self._current_step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
|
||||
if decay_ratio < 0.0 or decay_ratio > 1.0:
|
||||
raise RuntimeError(
|
||||
"Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings."
|
||||
)
|
||||
raise RuntimeError("Decay ratio must be in [0.0, 1.0]. Fix LR scheduler settings.")
|
||||
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
|
||||
lr = self.end_lr + coeff * (self.peak_lr - self.end_lr)
|
||||
|
||||
@@ -70,7 +66,13 @@ if __name__ == "__main__":
|
||||
m = nn.Linear(10, 10)
|
||||
opt = Adam(m.parameters(), lr=1e-4)
|
||||
s = WarmupCosineLRSchedule(
|
||||
opt, 1e-6, 2e-4, 1e-6, warmup_steps=2000, total_steps=20000, current_step=0
|
||||
opt,
|
||||
1e-6,
|
||||
2e-4,
|
||||
1e-6,
|
||||
warmup_steps=2000,
|
||||
total_steps=20000,
|
||||
current_step=0,
|
||||
)
|
||||
lrs = []
|
||||
for i in range(25000):
|
||||
|
||||
@@ -16,8 +16,7 @@
|
||||
import contextlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
from typing import List, Tuple
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
@@ -71,12 +70,8 @@ class BatchedOptimizer(Optimizer):
|
||||
group_params_names: name for each parameter in group,
|
||||
which is List[str].
|
||||
"""
|
||||
batches = defaultdict(
|
||||
list
|
||||
) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
|
||||
batches_names = defaultdict(
|
||||
list
|
||||
) # `batches` maps from tuple (dtype_as_str,*shape) to list of str
|
||||
batches = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of nn.Parameter
|
||||
batches_names = defaultdict(list) # `batches` maps from tuple (dtype_as_str,*shape) to list of str
|
||||
|
||||
assert len(param_group) == len(group_params_names)
|
||||
for p, named_p in zip(param_group, group_params_names):
|
||||
@@ -85,11 +80,8 @@ class BatchedOptimizer(Optimizer):
|
||||
batches_names[key].append(named_p)
|
||||
|
||||
batches_names_keys = list(batches_names.keys())
|
||||
sorted_idx = sorted(
|
||||
range(len(batches_names)), key=lambda i: batches_names_keys[i])
|
||||
batches_names = [
|
||||
batches_names[batches_names_keys[idx]] for idx in sorted_idx
|
||||
]
|
||||
sorted_idx = sorted(range(len(batches_names)), key=lambda i: batches_names_keys[i])
|
||||
batches_names = [batches_names[batches_names_keys[idx]] for idx in sorted_idx]
|
||||
batches = [batches[batches_names_keys[idx]] for idx in sorted_idx]
|
||||
|
||||
stacked_params_dict = dict()
|
||||
@@ -106,16 +98,14 @@ class BatchedOptimizer(Optimizer):
|
||||
# group. class Optimizer will take care of saving/loading state.
|
||||
state = self.state[p]
|
||||
p_stacked = torch.stack(batch)
|
||||
grad = torch.stack([
|
||||
torch.zeros_like(p) if p.grad is None else p.grad for p in batch
|
||||
])
|
||||
grad = torch.stack([torch.zeros_like(p) if p.grad is None else p.grad for p in batch])
|
||||
p_stacked.grad = grad
|
||||
stacked_params_dict[key] = p_stacked
|
||||
tuples.append((p_stacked, state, batch_names))
|
||||
|
||||
yield tuples # <-- calling code will do the actual optimization here!
|
||||
|
||||
for ((stacked_params, _state, _names), batch) in zip(tuples, batches):
|
||||
for (stacked_params, _state, _names), batch in zip(tuples, batches):
|
||||
for i, p in enumerate(batch): # batch is list of Parameter
|
||||
p.copy_(stacked_params[i])
|
||||
|
||||
@@ -164,25 +154,24 @@ class ScaledAdam(BatchedOptimizer):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
params,
|
||||
lr=3e-02,
|
||||
clipping_scale=None,
|
||||
betas=(0.9, 0.98),
|
||||
scalar_lr_scale=0.1,
|
||||
eps=1.0e-08,
|
||||
param_min_rms=1.0e-05,
|
||||
param_max_rms=3.0,
|
||||
scalar_max=10.0,
|
||||
size_update_period=4,
|
||||
clipping_update_period=100,
|
||||
parameters_names=None,
|
||||
show_dominant_parameters=True, ):
|
||||
|
||||
self,
|
||||
params,
|
||||
lr=3e-02,
|
||||
clipping_scale=None,
|
||||
betas=(0.9, 0.98),
|
||||
scalar_lr_scale=0.1,
|
||||
eps=1.0e-08,
|
||||
param_min_rms=1.0e-05,
|
||||
param_max_rms=3.0,
|
||||
scalar_max=10.0,
|
||||
size_update_period=4,
|
||||
clipping_update_period=100,
|
||||
parameters_names=None,
|
||||
show_dominant_parameters=True,
|
||||
):
|
||||
assert parameters_names is not None, (
|
||||
"Please prepare parameters_names,"
|
||||
"which is a List[List[str]]. Each List[str] is for a group"
|
||||
"and each str is for a parameter")
|
||||
"Please prepare parameters_names,which is a List[List[str]]. Each List[str] is for a groupand each str is for a parameter"
|
||||
)
|
||||
defaults = dict(
|
||||
lr=lr,
|
||||
clipping_scale=clipping_scale,
|
||||
@@ -193,7 +182,8 @@ class ScaledAdam(BatchedOptimizer):
|
||||
param_max_rms=param_max_rms,
|
||||
scalar_max=scalar_max,
|
||||
size_update_period=size_update_period,
|
||||
clipping_update_period=clipping_update_period, )
|
||||
clipping_update_period=clipping_update_period,
|
||||
)
|
||||
|
||||
super(ScaledAdam, self).__init__(params, defaults)
|
||||
assert len(self.param_groups) == len(parameters_names)
|
||||
@@ -218,18 +208,13 @@ class ScaledAdam(BatchedOptimizer):
|
||||
|
||||
batch = True
|
||||
|
||||
for group, group_params_names in zip(self.param_groups,
|
||||
self.parameters_names):
|
||||
|
||||
with self.batched_params(group["params"],
|
||||
group_params_names) as batches:
|
||||
|
||||
for group, group_params_names in zip(self.param_groups, self.parameters_names):
|
||||
with self.batched_params(group["params"], group_params_names) as batches:
|
||||
# batches is list of pairs (stacked_param, state). stacked_param is like
|
||||
# a regular parameter, and will have a .grad, but the 1st dim corresponds to
|
||||
# a stacking dim, it is not a real dim.
|
||||
|
||||
if (len(batches[0][1]) ==
|
||||
0): # if len(first state) == 0: not yet initialized
|
||||
if len(batches[0][1]) == 0: # if len(first state) == 0: not yet initialized
|
||||
clipping_scale = 1
|
||||
else:
|
||||
clipping_scale = self._get_clipping_scale(group, batches)
|
||||
@@ -239,9 +224,7 @@ class ScaledAdam(BatchedOptimizer):
|
||||
# grad is not going to be None, we handled that when creating the batches.
|
||||
grad = p.grad
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError(
|
||||
"ScaledAdam optimizer does not support sparse gradients"
|
||||
)
|
||||
raise RuntimeError("ScaledAdam optimizer does not support sparse gradients")
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
self._init_state(group, p, state)
|
||||
@@ -274,8 +257,7 @@ class ScaledAdam(BatchedOptimizer):
|
||||
# parameter-change "delta", which combines all forms of
|
||||
# update. this is equivalent to how it's done in Adam,
|
||||
# except for the first few steps.
|
||||
state["delta"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format)
|
||||
state["delta"] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
||||
|
||||
batch_size = p.shape[0]
|
||||
numel = p.numel() // batch_size
|
||||
@@ -285,22 +267,16 @@ class ScaledAdam(BatchedOptimizer):
|
||||
# "param_rms" just periodically records the scalar root-mean-square value of
|
||||
# the parameter tensor.
|
||||
# it has a shape like (batch_size, 1, 1, 1, 1)
|
||||
param_rms = (
|
||||
(p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt())
|
||||
param_rms = (p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt()
|
||||
state["param_rms"] = param_rms
|
||||
|
||||
state["scale_exp_avg_sq"] = torch.zeros_like(param_rms)
|
||||
state["scale_grads"] = torch.zeros(size_update_period,
|
||||
*param_rms.shape, **kwargs)
|
||||
state["scale_grads"] = torch.zeros(size_update_period, *param_rms.shape, **kwargs)
|
||||
|
||||
# exp_avg_sq is the weighted sum of scaled gradients. as in Adam.
|
||||
state["exp_avg_sq"] = torch.zeros_like(
|
||||
p, memory_format=torch.preserve_format)
|
||||
state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
|
||||
|
||||
def _get_clipping_scale(self,
|
||||
group: dict,
|
||||
tuples: List[Tuple[Tensor, dict, List[str]]]
|
||||
) -> float:
|
||||
def _get_clipping_scale(self, group: dict, tuples: List[Tuple[Tensor, dict, List[str]]]) -> float:
|
||||
"""
|
||||
Returns a scalar factor <= 1.0 that dictates gradient clipping, i.e. we will scale the gradients
|
||||
by this amount before applying the rest of the update.
|
||||
@@ -325,20 +301,18 @@ class ScaledAdam(BatchedOptimizer):
|
||||
clipping_update_period = group["clipping_update_period"]
|
||||
|
||||
tot_sumsq = torch.tensor(0.0, device=first_p.device)
|
||||
for (p, state, param_names) in tuples:
|
||||
for p, state, param_names in tuples:
|
||||
grad = p.grad
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError(
|
||||
"ScaledAdam optimizer does not support sparse gradients")
|
||||
raise RuntimeError("ScaledAdam optimizer does not support sparse gradients")
|
||||
if p.numel() == p.shape[0]: # a batch of scalars
|
||||
tot_sumsq += (grad**2).sum() # sum() to change shape [1] to []
|
||||
else:
|
||||
tot_sumsq += ((grad * state["param_rms"])**2).sum()
|
||||
tot_sumsq += ((grad * state["param_rms"]) ** 2).sum()
|
||||
|
||||
tot_norm = tot_sumsq.sqrt()
|
||||
if "model_norms" not in first_state:
|
||||
first_state["model_norms"] = torch.zeros(
|
||||
clipping_update_period, device=p.device)
|
||||
first_state["model_norms"] = torch.zeros(clipping_update_period, device=p.device)
|
||||
first_state["model_norms"][step % clipping_update_period] = tot_norm
|
||||
|
||||
if step % clipping_update_period == 0:
|
||||
@@ -350,20 +324,20 @@ class ScaledAdam(BatchedOptimizer):
|
||||
for n in range(0, 5):
|
||||
index = min(
|
||||
clipping_update_period - 1,
|
||||
(clipping_update_period // 4) * n, )
|
||||
(clipping_update_period // 4) * n,
|
||||
)
|
||||
quartiles.append(sorted_norms[index].item())
|
||||
|
||||
median = quartiles[2]
|
||||
threshold = clipping_scale * median
|
||||
first_state["model_norm_threshold"] = threshold
|
||||
percent_clipped = (first_state["num_clipped"] * 100.0 /
|
||||
clipping_update_period
|
||||
if "num_clipped" in first_state else 0.0)
|
||||
percent_clipped = (
|
||||
first_state["num_clipped"] * 100.0 / clipping_update_period if "num_clipped" in first_state else 0.0
|
||||
)
|
||||
first_state["num_clipped"] = 0
|
||||
quartiles = " ".join(["%.3e" % x for x in quartiles])
|
||||
logging.info(
|
||||
f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, "
|
||||
f"threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
|
||||
f"Clipping_scale={clipping_scale}, grad-norm quartiles {quartiles}, threshold={threshold:.3e}, percent-clipped={percent_clipped:.1f}"
|
||||
)
|
||||
|
||||
if step < clipping_update_period:
|
||||
@@ -373,25 +347,20 @@ class ScaledAdam(BatchedOptimizer):
|
||||
model_norm_threshold = first_state["model_norm_threshold"]
|
||||
except KeyError:
|
||||
logging.info(
|
||||
"Warning: model_norm_threshold not in state: possibly "
|
||||
"you changed config when restarting, adding clipping_scale option?"
|
||||
"Warning: model_norm_threshold not in state: possibly you changed config when restarting, adding clipping_scale option?"
|
||||
)
|
||||
return 1.0
|
||||
ans = min(1.0, (model_norm_threshold / (tot_norm + 1.0e-20)).item())
|
||||
if ans < 1.0:
|
||||
first_state["num_clipped"] += 1
|
||||
if ans < 0.1:
|
||||
logging.warn(
|
||||
f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}"
|
||||
)
|
||||
logging.warn(f"Scaling gradients by {ans}, model_norm_threshold={model_norm_threshold}")
|
||||
if self.show_dominant_parameters:
|
||||
assert p.shape[0] == len(param_names)
|
||||
self._show_gradient_dominating_parameter(tuples, tot_sumsq)
|
||||
return ans
|
||||
|
||||
def _show_gradient_dominating_parameter(
|
||||
self, tuples: List[Tuple[Tensor, dict, List[str]]],
|
||||
tot_sumsq: Tensor):
|
||||
def _show_gradient_dominating_parameter(self, tuples: List[Tuple[Tensor, dict, List[str]]], tot_sumsq: Tensor):
|
||||
"""
|
||||
Show information of the parameter which dominates tot_sumsq.
|
||||
|
||||
@@ -406,7 +375,7 @@ class ScaledAdam(BatchedOptimizer):
|
||||
from tuples, we still pass it to save some time.
|
||||
"""
|
||||
all_sumsq_orig = {}
|
||||
for (p, state, batch_param_names) in tuples:
|
||||
for p, state, batch_param_names in tuples:
|
||||
# p is a stacked batch parameters.
|
||||
batch_grad = p.grad
|
||||
if p.numel() == p.shape[0]: # a batch of scalars
|
||||
@@ -415,41 +384,46 @@ class ScaledAdam(BatchedOptimizer):
|
||||
batch_rms_orig = torch.ones(p.shape[0])
|
||||
else:
|
||||
batch_rms_orig = state["param_rms"]
|
||||
batch_sumsq_orig = ((batch_grad * batch_rms_orig)**2).sum(
|
||||
dim=list(range(1, batch_grad.ndim)))
|
||||
|
||||
for name, sumsq_orig, rms, grad in zip(batch_param_names,
|
||||
batch_sumsq_orig,
|
||||
batch_rms_orig, batch_grad):
|
||||
batch_sumsq_orig = ((batch_grad * batch_rms_orig) ** 2).sum(dim=list(range(1, batch_grad.ndim)))
|
||||
|
||||
for name, sumsq_orig, rms, grad in zip(
|
||||
batch_param_names,
|
||||
batch_sumsq_orig,
|
||||
batch_rms_orig,
|
||||
batch_grad,
|
||||
):
|
||||
proportion_orig = sumsq_orig / tot_sumsq
|
||||
all_sumsq_orig[name] = (proportion_orig, sumsq_orig, rms, grad)
|
||||
|
||||
assert torch.isclose(
|
||||
sum([value[0] for value in all_sumsq_orig.values()]).cpu(),
|
||||
torch.tensor(1.0), )
|
||||
torch.tensor(1.0),
|
||||
)
|
||||
sorted_by_proportion = {
|
||||
k: v
|
||||
for k, v in sorted(
|
||||
all_sumsq_orig.items(),
|
||||
key=lambda item: item[1][0],
|
||||
reverse=True, )
|
||||
reverse=True,
|
||||
)
|
||||
}
|
||||
dominant_param_name = next(iter(sorted_by_proportion))
|
||||
(dominant_proportion, dominant_sumsq, dominant_rms,
|
||||
dominant_grad, ) = sorted_by_proportion[dominant_param_name]
|
||||
logging.info(f"Parameter Dominanting tot_sumsq {dominant_param_name}"
|
||||
f" with proportion {dominant_proportion:.2f},"
|
||||
f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
|
||||
f"={dominant_sumsq:.3e},"
|
||||
f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
|
||||
f" orig_rms_sq={(dominant_rms**2).item():.3e}")
|
||||
(
|
||||
dominant_proportion,
|
||||
dominant_sumsq,
|
||||
dominant_rms,
|
||||
dominant_grad,
|
||||
) = sorted_by_proportion[dominant_param_name]
|
||||
logging.info(
|
||||
f"Parameter Dominanting tot_sumsq {dominant_param_name}"
|
||||
f" with proportion {dominant_proportion:.2f},"
|
||||
f" where dominant_sumsq=(grad_sumsq*orig_rms_sq)"
|
||||
f"={dominant_sumsq:.3e},"
|
||||
f" grad_sumsq = {(dominant_grad**2).sum():.3e},"
|
||||
f" orig_rms_sq={(dominant_rms**2).item():.3e}"
|
||||
)
|
||||
|
||||
def _step_one_batch(self,
|
||||
group: dict,
|
||||
p: Tensor,
|
||||
state: dict,
|
||||
clipping_scale: float):
|
||||
def _step_one_batch(self, group: dict, p: Tensor, state: dict, clipping_scale: float):
|
||||
"""
|
||||
Do the step for one parameter, which is actually going to be a batch of
|
||||
`real` parameters, with dim 0 as the batch dim.
|
||||
@@ -475,13 +449,10 @@ class ScaledAdam(BatchedOptimizer):
|
||||
if numel > 1:
|
||||
# Update the size/scale of p, and set param_rms
|
||||
scale_grads = state["scale_grads"]
|
||||
scale_grads[step % size_update_period] = (p * grad).sum(
|
||||
dim=list(range(1, p.ndim)), keepdim=True)
|
||||
scale_grads[step % size_update_period] = (p * grad).sum(dim=list(range(1, p.ndim)), keepdim=True)
|
||||
if step % size_update_period == size_update_period - 1:
|
||||
param_rms = state["param_rms"] # shape: (batch_size, 1, 1, ..)
|
||||
param_rms.copy_((p**2)
|
||||
.mean(dim=list(range(1, p.ndim)), keepdim=True)
|
||||
.sqrt())
|
||||
param_rms.copy_((p**2).mean(dim=list(range(1, p.ndim)), keepdim=True).sqrt())
|
||||
if step > 0:
|
||||
# self._size_update() learns the overall scale on the
|
||||
# parameter, by shrinking or expanding it.
|
||||
@@ -496,11 +467,13 @@ class ScaledAdam(BatchedOptimizer):
|
||||
|
||||
state["step"] = step + 1
|
||||
|
||||
def _size_update(self,
|
||||
group: dict,
|
||||
scale_grads: Tensor,
|
||||
p: Tensor,
|
||||
state: dict) -> None:
|
||||
def _size_update(
|
||||
self,
|
||||
group: dict,
|
||||
scale_grads: Tensor,
|
||||
p: Tensor,
|
||||
state: dict,
|
||||
) -> None:
|
||||
"""
|
||||
Called only where p.numel() > 1, this updates the scale of the parameter.
|
||||
If we imagine: p = underlying_param * scale.exp(), and we are doing
|
||||
@@ -529,11 +502,11 @@ class ScaledAdam(BatchedOptimizer):
|
||||
# faster decay at this level.
|
||||
beta2_corr = beta2**size_update_period
|
||||
|
||||
scale_exp_avg_sq = state[
|
||||
"scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
|
||||
scale_exp_avg_sq = state["scale_exp_avg_sq"] # shape: (batch_size, 1, 1, ..)
|
||||
scale_exp_avg_sq.mul_(beta2_corr).add_(
|
||||
(scale_grads**2).mean(dim=0), # mean over dim `size_update_period`
|
||||
alpha=1 - beta2_corr, ) # shape is (batch_size, 1, 1, ...)
|
||||
alpha=1 - beta2_corr,
|
||||
) # shape is (batch_size, 1, 1, ...)
|
||||
|
||||
# The 1st time we reach here is when size_step == 1.
|
||||
size_step = (step + 1) // size_update_period
|
||||
@@ -543,8 +516,7 @@ class ScaledAdam(BatchedOptimizer):
|
||||
|
||||
denom = scale_exp_avg_sq.sqrt() + eps
|
||||
|
||||
scale_step = (-size_lr * (bias_correction2**0.5) *
|
||||
scale_grads.sum(dim=0) / denom)
|
||||
scale_step = -size_lr * (bias_correction2**0.5) * scale_grads.sum(dim=0) / denom
|
||||
|
||||
is_too_small = param_rms < param_min_rms
|
||||
is_too_large = param_rms > param_max_rms
|
||||
@@ -580,9 +552,8 @@ class ScaledAdam(BatchedOptimizer):
|
||||
exp_avg_sq = state["exp_avg_sq"]
|
||||
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
|
||||
|
||||
this_step = state["step"] - (state["zero_step"]
|
||||
if "zero_step" in state else 0)
|
||||
bias_correction2 = 1 - beta2**(this_step + 1)
|
||||
this_step = state["step"] - (state["zero_step"] if "zero_step" in state else 0)
|
||||
bias_correction2 = 1 - beta2 ** (this_step + 1)
|
||||
if bias_correction2 < 0.99:
|
||||
# note: not in-place.
|
||||
exp_avg_sq = exp_avg_sq * (1.0 / bias_correction2)
|
||||
@@ -613,7 +584,7 @@ class ScaledAdam(BatchedOptimizer):
|
||||
|
||||
# bias_correction2 is like in Adam. Don't bother with bias_correction1;
|
||||
# slower update at the start will help stability anyway.
|
||||
bias_correction2 = 1 - beta2**(state["step"] + 1)
|
||||
bias_correction2 = 1 - beta2 ** (state["step"] + 1)
|
||||
denom = (exp_avg_sq / bias_correction2).sqrt() + eps
|
||||
|
||||
delta = state["delta"]
|
||||
|
||||
@@ -5,7 +5,6 @@ from torch.nn.functional import (
|
||||
_none_or_dtype,
|
||||
_in_projection_packed,
|
||||
)
|
||||
from torch.nn import functional as F
|
||||
import torch
|
||||
# Tensor = torch.Tensor
|
||||
# from typing import Callable, List, Optional, Tuple, Union
|
||||
@@ -25,18 +24,18 @@ def multi_head_attention_forward_patched(
|
||||
dropout_p: float,
|
||||
out_proj_weight,
|
||||
out_proj_bias,
|
||||
training = True,
|
||||
key_padding_mask = None,
|
||||
need_weights = True,
|
||||
attn_mask = None,
|
||||
use_separate_proj_weight = False,
|
||||
q_proj_weight = None,
|
||||
k_proj_weight = None,
|
||||
v_proj_weight = None,
|
||||
static_k = None,
|
||||
static_v = None,
|
||||
average_attn_weights = True,
|
||||
is_causal = False,
|
||||
training=True,
|
||||
key_padding_mask=None,
|
||||
need_weights=True,
|
||||
attn_mask=None,
|
||||
use_separate_proj_weight=False,
|
||||
q_proj_weight=None,
|
||||
k_proj_weight=None,
|
||||
v_proj_weight=None,
|
||||
static_k=None,
|
||||
static_v=None,
|
||||
average_attn_weights=True,
|
||||
is_causal=False,
|
||||
cache=None,
|
||||
):
|
||||
r"""
|
||||
@@ -156,9 +155,7 @@ def multi_head_attention_forward_patched(
|
||||
cache=cache,
|
||||
)
|
||||
|
||||
is_batched = _mha_shape_check(
|
||||
query, key, value, key_padding_mask, attn_mask, num_heads
|
||||
)
|
||||
is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
|
||||
|
||||
# For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
|
||||
# is batched, run the computation and before returning squeeze the
|
||||
@@ -211,45 +208,33 @@ def multi_head_attention_forward_patched(
|
||||
# longer causal.
|
||||
is_causal = False
|
||||
|
||||
assert (
|
||||
embed_dim == embed_dim_to_check
|
||||
), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
|
||||
assert embed_dim == embed_dim_to_check, (
|
||||
f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
|
||||
)
|
||||
if isinstance(embed_dim, torch.Tensor):
|
||||
# embed_dim can be a tensor when JIT tracing
|
||||
head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
|
||||
else:
|
||||
head_dim = embed_dim // num_heads
|
||||
assert (
|
||||
head_dim * num_heads == embed_dim
|
||||
), f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
|
||||
assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
|
||||
if use_separate_proj_weight:
|
||||
# allow MHA to have different embedding dimensions when separate projection weights are used
|
||||
assert (
|
||||
key.shape[:2] == value.shape[:2]
|
||||
), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
|
||||
assert key.shape[:2] == value.shape[:2], (
|
||||
f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
|
||||
)
|
||||
else:
|
||||
assert (
|
||||
key.shape == value.shape
|
||||
), f"key shape {key.shape} does not match value shape {value.shape}"
|
||||
assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
|
||||
|
||||
#
|
||||
# compute in-projection
|
||||
#
|
||||
if not use_separate_proj_weight:
|
||||
assert (
|
||||
in_proj_weight is not None
|
||||
), "use_separate_proj_weight is False but in_proj_weight is None"
|
||||
assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
|
||||
q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
|
||||
else:
|
||||
assert (
|
||||
q_proj_weight is not None
|
||||
), "use_separate_proj_weight is True but q_proj_weight is None"
|
||||
assert (
|
||||
k_proj_weight is not None
|
||||
), "use_separate_proj_weight is True but k_proj_weight is None"
|
||||
assert (
|
||||
v_proj_weight is not None
|
||||
), "use_separate_proj_weight is True but v_proj_weight is None"
|
||||
assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
|
||||
assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
|
||||
assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
|
||||
if in_proj_bias is None:
|
||||
b_q = b_k = b_v = None
|
||||
else:
|
||||
@@ -312,9 +297,7 @@ def multi_head_attention_forward_patched(
|
||||
f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"attn_mask's dimension {attn_mask.dim()} is not supported"
|
||||
)
|
||||
raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
|
||||
|
||||
# add bias along batch dimension (currently second)
|
||||
if bias_k is not None and bias_v is not None:
|
||||
@@ -338,34 +321,26 @@ def multi_head_attention_forward_patched(
|
||||
k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
|
||||
else:
|
||||
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
|
||||
assert (
|
||||
static_k.size(0) == bsz * num_heads
|
||||
), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
|
||||
assert (
|
||||
static_k.size(2) == head_dim
|
||||
), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
|
||||
assert static_k.size(0) == bsz * num_heads, (
|
||||
f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
|
||||
)
|
||||
assert static_k.size(2) == head_dim, f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
|
||||
k = static_k
|
||||
if static_v is None:
|
||||
v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
|
||||
else:
|
||||
# TODO finish disentangling control flow so we don't do in-projections when statics are passed
|
||||
assert (
|
||||
static_v.size(0) == bsz * num_heads
|
||||
), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
|
||||
assert (
|
||||
static_v.size(2) == head_dim
|
||||
), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
|
||||
assert static_v.size(0) == bsz * num_heads, (
|
||||
f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
|
||||
)
|
||||
assert static_v.size(2) == head_dim, f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
|
||||
v = static_v
|
||||
|
||||
# add zero attention along batch dimension (now first)
|
||||
if add_zero_attn:
|
||||
zero_attn_shape = (bsz * num_heads, 1, head_dim)
|
||||
k = torch.cat(
|
||||
[k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1
|
||||
)
|
||||
v = torch.cat(
|
||||
[v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1
|
||||
)
|
||||
k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
|
||||
v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
|
||||
if attn_mask is not None:
|
||||
attn_mask = pad(attn_mask, (0, 1))
|
||||
if key_padding_mask is not None:
|
||||
@@ -381,9 +356,7 @@ def multi_head_attention_forward_patched(
|
||||
src_len,
|
||||
), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
|
||||
key_padding_mask = (
|
||||
key_padding_mask.view(bsz, 1, 1, src_len)
|
||||
.expand(-1, num_heads, -1, -1)
|
||||
.reshape(bsz * num_heads, 1, src_len)
|
||||
key_padding_mask.view(bsz, 1, 1, src_len).expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
|
||||
)
|
||||
if attn_mask is None:
|
||||
attn_mask = key_padding_mask
|
||||
@@ -402,14 +375,10 @@ def multi_head_attention_forward_patched(
|
||||
B, Nt, E = q.shape
|
||||
q_scaled = q / math.sqrt(E)
|
||||
|
||||
assert not (
|
||||
is_causal and attn_mask is None
|
||||
), "FIXME: is_causal not implemented for need_weights"
|
||||
assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
|
||||
|
||||
if attn_mask is not None:
|
||||
attn_output_weights = torch.baddbmm(
|
||||
attn_mask, q_scaled, k.transpose(-2, -1)
|
||||
)
|
||||
attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
|
||||
else:
|
||||
attn_output_weights = torch.bmm(q_scaled, k.transpose(-2, -1))
|
||||
attn_output_weights = softmax(attn_output_weights, dim=-1)
|
||||
@@ -418,9 +387,7 @@ def multi_head_attention_forward_patched(
|
||||
|
||||
attn_output = torch.bmm(attn_output_weights, v)
|
||||
|
||||
attn_output = (
|
||||
attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
|
||||
)
|
||||
attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
|
||||
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
|
||||
attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
|
||||
|
||||
@@ -449,13 +416,9 @@ def multi_head_attention_forward_patched(
|
||||
v = v.view(bsz, num_heads, src_len, head_dim)
|
||||
|
||||
# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
|
||||
attn_output = scaled_dot_product_attention(
|
||||
q, k, v, attn_mask, dropout_p, is_causal
|
||||
)
|
||||
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
|
||||
|
||||
attn_output = (
|
||||
attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
|
||||
)
|
||||
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
|
||||
|
||||
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
|
||||
attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
from torch.nn.functional import *
|
||||
from torch.nn.functional import (
|
||||
_mha_shape_check,
|
||||
_canonical_mask,
|
||||
_none_or_dtype,
|
||||
_in_projection_packed,
|
||||
)
|
||||
|
||||
|
||||
def multi_head_attention_forward_patched(
|
||||
query,
|
||||
key,
|
||||
@@ -34,7 +32,6 @@ def multi_head_attention_forward_patched(
|
||||
is_causal: bool = False,
|
||||
cache=None,
|
||||
) -> Tuple[Tensor, Optional[Tensor]]:
|
||||
|
||||
# set up shape vars
|
||||
_, _, embed_dim = query.shape
|
||||
attn_mask = _canonical_mask(
|
||||
@@ -80,12 +77,8 @@ def multi_head_attention_forward_patched(
|
||||
q = q.view(num_heads, -1, head_dim).unsqueeze(0)
|
||||
k = k.view(num_heads, -1, head_dim).unsqueeze(0)
|
||||
v = v.view(num_heads, -1, head_dim).unsqueeze(0)
|
||||
attn_output = scaled_dot_product_attention(
|
||||
q, k, v, attn_mask, dropout_p, is_causal
|
||||
)
|
||||
attn_output = (
|
||||
attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
|
||||
)
|
||||
attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
|
||||
attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(-1, embed_dim)
|
||||
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
|
||||
attn_output = attn_output.view(-1, 1, attn_output.size(1))
|
||||
|
||||
|
||||
@@ -13,12 +13,9 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import logging
|
||||
import math
|
||||
import random
|
||||
from typing import Optional
|
||||
from typing import Tuple
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -61,9 +58,7 @@ class DoubleSwishFunction(torch.autograd.Function):
|
||||
# floors), should be expectation-preserving.
|
||||
floor = -0.043637
|
||||
ceil = 1.2
|
||||
d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(
|
||||
deriv
|
||||
)
|
||||
d_scaled = (deriv - floor) * (255.0 / (ceil - floor)) + torch.rand_like(deriv)
|
||||
if __name__ == "__main__":
|
||||
# for self-testing only.
|
||||
assert d_scaled.min() >= 0.0
|
||||
@@ -153,13 +148,9 @@ def _compute_scale_factor(
|
||||
else:
|
||||
# below_threshold is 0 if x_abs_mean > min_abs, can be at most max_factor if
|
||||
# x_abs)_mean , min_abs.
|
||||
below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(
|
||||
min=0, max=max_factor
|
||||
)
|
||||
below_threshold = ((min_abs - x_abs_mean) * (gain_factor / min_abs)).clamp(min=0, max=max_factor)
|
||||
|
||||
above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(
|
||||
min=0, max=max_factor
|
||||
)
|
||||
above_threshold = ((x_abs_mean - max_abs) * (gain_factor / max_abs)).clamp(min=0, max=max_factor)
|
||||
|
||||
return below_threshold - above_threshold
|
||||
|
||||
@@ -181,18 +172,16 @@ def _compute_sign_factor(
|
||||
else:
|
||||
# 0 if proportion_positive >= min_positive, else can be
|
||||
# as large as max_factor.
|
||||
factor1 = (
|
||||
(min_positive - proportion_positive) * (gain_factor / min_positive)
|
||||
).clamp_(min=0, max=max_factor)
|
||||
factor1 = ((min_positive - proportion_positive) * (gain_factor / min_positive)).clamp_(min=0, max=max_factor)
|
||||
|
||||
if max_positive == 1.0:
|
||||
factor2 = 0.0
|
||||
else:
|
||||
# 0 if self.proportion_positive <= max_positive, else can be
|
||||
# as large as -max_factor.
|
||||
factor2 = (
|
||||
(proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))
|
||||
).clamp_(min=0, max=max_factor)
|
||||
factor2 = ((proportion_positive - max_positive) * (gain_factor / (1.0 - max_positive))).clamp_(
|
||||
min=0, max=max_factor
|
||||
)
|
||||
sign_factor = factor1 - factor2
|
||||
# require min_positive != 0 or max_positive != 1:
|
||||
assert not isinstance(sign_factor, float)
|
||||
@@ -320,15 +309,11 @@ class ActivationBalancer(torch.nn.Module):
|
||||
return _no_op(x)
|
||||
|
||||
|
||||
def BalancedDoubleSwish(
|
||||
d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25
|
||||
) -> nn.Sequential:
|
||||
def BalancedDoubleSwish(d_model, channel_dim=-1, max_abs=10.0, min_prob=0.25) -> nn.Sequential:
|
||||
"""
|
||||
ActivationBalancer -> DoubleSwish
|
||||
"""
|
||||
balancer = ActivationBalancer(
|
||||
d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob
|
||||
)
|
||||
balancer = ActivationBalancer(d_model, channel_dim=channel_dim, max_abs=max_abs, min_prob=min_prob)
|
||||
return nn.Sequential(
|
||||
balancer,
|
||||
DoubleSwish(),
|
||||
|
||||
@@ -42,12 +42,8 @@ class LayerNorm(nn.Module):
|
||||
self.eps = eps
|
||||
self.elementwise_affine = elementwise_affine
|
||||
if self.elementwise_affine:
|
||||
self.weight = nn.Parameter(
|
||||
torch.empty(self.normalized_shape, **factory_kwargs)
|
||||
)
|
||||
self.bias = nn.Parameter(
|
||||
torch.empty(self.normalized_shape, **factory_kwargs)
|
||||
)
|
||||
self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
|
||||
self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
|
||||
else:
|
||||
self.register_parameter("weight", None)
|
||||
self.register_parameter("bias", None)
|
||||
@@ -74,15 +70,10 @@ class LayerNorm(nn.Module):
|
||||
)
|
||||
|
||||
assert embedding is None
|
||||
return F.layer_norm(
|
||||
input, self.normalized_shape, self.weight, self.bias, self.eps
|
||||
)
|
||||
return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return (
|
||||
"{normalized_shape}, eps={eps}, "
|
||||
"elementwise_affine={elementwise_affine}".format(**self.__dict__)
|
||||
)
|
||||
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__)
|
||||
|
||||
|
||||
class IdentityNorm(nn.Module):
|
||||
@@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module):
|
||||
>>> src = torch.rand(10, 32, 512)
|
||||
>>> out = transformer_encoder(src)
|
||||
"""
|
||||
|
||||
__constants__ = ["norm"]
|
||||
|
||||
def __init__(self, encoder_layer, num_layers, norm=None):
|
||||
@@ -218,13 +210,9 @@ class TransformerEncoderLayer(nn.Module):
|
||||
)
|
||||
|
||||
# Implementation of Feedforward model
|
||||
self.linear1 = linear1_feedforward_cls(
|
||||
d_model, dim_feedforward, **factory_kwargs
|
||||
)
|
||||
self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.linear2 = linear2_feedforward_cls(
|
||||
dim_feedforward, d_model, **factory_kwargs
|
||||
)
|
||||
self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs)
|
||||
|
||||
self.norm_first = norm_first
|
||||
self.dropout1 = nn.Dropout(dropout)
|
||||
@@ -291,12 +279,8 @@ class TransformerEncoderLayer(nn.Module):
|
||||
|
||||
if src_key_padding_mask is not None:
|
||||
_skpm_dtype = src_key_padding_mask.dtype
|
||||
if _skpm_dtype != torch.bool and not torch.is_floating_point(
|
||||
src_key_padding_mask
|
||||
):
|
||||
raise AssertionError(
|
||||
"only bool and floating types of key_padding_mask are supported"
|
||||
)
|
||||
if _skpm_dtype != torch.bool and not torch.is_floating_point(src_key_padding_mask):
|
||||
raise AssertionError("only bool and floating types of key_padding_mask are supported")
|
||||
|
||||
if self.norm_first:
|
||||
x = x + self._sa_block(
|
||||
|
||||
@@ -42,12 +42,8 @@ class LayerNorm(nn.Module):
|
||||
self.eps = eps
|
||||
self.elementwise_affine = elementwise_affine
|
||||
if self.elementwise_affine:
|
||||
self.weight = nn.Parameter(
|
||||
torch.empty(self.normalized_shape, **factory_kwargs)
|
||||
)
|
||||
self.bias = nn.Parameter(
|
||||
torch.empty(self.normalized_shape, **factory_kwargs)
|
||||
)
|
||||
self.weight = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
|
||||
self.bias = nn.Parameter(torch.empty(self.normalized_shape, **factory_kwargs))
|
||||
else:
|
||||
self.register_parameter("weight", None)
|
||||
self.register_parameter("bias", None)
|
||||
@@ -74,15 +70,10 @@ class LayerNorm(nn.Module):
|
||||
)
|
||||
|
||||
assert embedding is None
|
||||
return F.layer_norm(
|
||||
input, self.normalized_shape, self.weight, self.bias, self.eps
|
||||
)
|
||||
return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps)
|
||||
|
||||
def extra_repr(self) -> str:
|
||||
return (
|
||||
"{normalized_shape}, eps={eps}, "
|
||||
"elementwise_affine={elementwise_affine}".format(**self.__dict__)
|
||||
)
|
||||
return "{normalized_shape}, eps={eps}, elementwise_affine={elementwise_affine}".format(**self.__dict__)
|
||||
|
||||
|
||||
class IdentityNorm(nn.Module):
|
||||
@@ -121,6 +112,7 @@ class TransformerEncoder(nn.Module):
|
||||
>>> src = torch.rand(10, 32, 512)
|
||||
>>> out = transformer_encoder(src)
|
||||
"""
|
||||
|
||||
__constants__ = ["norm"]
|
||||
|
||||
def __init__(self, encoder_layer, num_layers, norm=None):
|
||||
@@ -154,6 +146,7 @@ class TransformerEncoder(nn.Module):
|
||||
|
||||
class TransformerEncoderLayer(nn.Module):
|
||||
__constants__ = ["batch_first", "norm_first"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
d_model: int,
|
||||
@@ -184,13 +177,9 @@ class TransformerEncoderLayer(nn.Module):
|
||||
linear2_cls=linear2_self_attention_cls,
|
||||
**factory_kwargs,
|
||||
)
|
||||
self.linear1 = linear1_feedforward_cls(
|
||||
d_model, dim_feedforward, **factory_kwargs
|
||||
)
|
||||
self.linear1 = linear1_feedforward_cls(d_model, dim_feedforward, **factory_kwargs)
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.linear2 = linear2_feedforward_cls(
|
||||
dim_feedforward, d_model, **factory_kwargs
|
||||
)
|
||||
self.linear2 = linear2_feedforward_cls(dim_feedforward, d_model, **factory_kwargs)
|
||||
self.norm_first = norm_first
|
||||
self.dropout1 = nn.Dropout(dropout)
|
||||
self.dropout2 = nn.Dropout(dropout)
|
||||
|
||||