Introduce Docker and Windows CI Workflow, Pre-commit Formatting, and Language Resource Auto-Download (#2351)

* Docker Auto-Build Workflow * Rename * Update * Fix Bugs * Disable Progress Bar When workflows triggered * Fix Wget * Fix Bugs * Fix Bugs * Update Wget * Update Workflows * Accelerate Docker Image Building * Fix Install.sh * Add Skip-Check For Action Runner * Fix Dockerfile * . * . * . * . * Delete File in Runner * Add Sort * Delete More Files * Delete More * . * . * . * Add Pre-Commit Hook Update Docker * Add Code Spell Check * [pre-commit.ci] trigger * [pre-commit.ci] trigger * [pre-commit.ci] trigger * Fix Bugs * . * Disable Progress Bar and Logs while using GitHub Actions * . * . * Fix Bugs * update conda * fix bugs * Fix Bugs * fix bugs * . * . * Quiet Installation * fix bugs * . * fix bug * . * Fix pre-commit.ci and Docker * fix bugs * . * Update Docker & Pre-Commit * fix bugs * Update Req * Update Req * Update OpenCC * update precommit * . * Update .pre-commit-config.yaml * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update Docs and fix bugs * Fix \ * Fix MacOS * . * test * . * Add Tag Alias * . * fix bugs * fix bugs * make image smaller * update pre-commit config * . * . * fix bugs * use miniconda * Fix Wrong Path * . * debug * debug * revert * Fix Bugs * Update Docs, Add Dict Auto Download in install.sh * update docker_build * Update Docs for Install.sh * update docker docs about architecture * Add Xcode-Commandline-Tool Installation * Update Docs 1. Add Missing VC17 2. Modufied the Order of FFmpeg Installation and Requirements Installation 3. Remove Duplicate FFmpeg * Fix Wrong Cuda Version * Update TESTED ENV * Add PYTHONNOUSERSITE(-s) * Fix Wrapper * Update install.sh For Robustness * Ignore .git * Preload CUDNN For Ctranslate2 * Remove Gradio Warnings * Update Colab * Fix OpenCC Problems * Update Win DLL Strategy * Fix Onnxruntime-gpu NVRTC Error * Fix Path Problems * Add Windows Packages Workflow * WIP * WIP * WIP * WIP * WIP * WIP * . * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * WIP * Fix Path * Fix Path * Enable Logging * Set 7-Zip compression level to maximum (-mx=9) * Use Multithread in ONNX Session * Fix Tag Bugs * Add Time * Add Time * Add Time * Compress More * Copy DLL to Solve VC Runtime DLL Missing Issues * Expose FFmpeg Errors, Copy Only Part of Visual C++ Runtime * Update build_windows_packages.ps1 * Update build_windows_packages.ps1 * Update build_windows_packages.ps1 * Update build_windows_packages.ps1 * WIP * WIP * WIP * Update build_windows_packages.ps1 * Update install.sh * Update build_windows_packages.ps1 * Update docker-publish.yaml * Update install.sh * Update Dockerfile * Update docker_build.sh * Update miniconda_install.sh * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update Colab-WebUI.ipynb * Update Colab-Inference.ipynb * Update docker-compose.yaml * 更新 build_windows_packages.ps1 * Update install.sh --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-05-26 05:45:14 +03:00
parent 13055fa569
commit d5e479dad6
58 changed files with 2096 additions and 987 deletions
--- a/GPT_SoVITS/module/data_utils.py
+++ b/GPT_SoVITS/module/data_utils.py
@@ -470,6 +470,7 @@ class TextAudioSpeakerCollateV3:
        # return ssl_padded, spec_padded,mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, wav_padded, wav_lengths,mel_lengths
        return ssl_padded, spec_padded, mel_padded, ssl_lengths, spec_lengths, text_padded, text_lengths, mel_lengths

+
 class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
    """
    1) loads audio, speaker_id, text pairs
@@ -596,7 +597,7 @@ class TextAudioSpeakerLoaderV4(torch.utils.data.Dataset):
            audio_norm, self.filter_length, self.sampling_rate, self.hop_length, self.win_length, center=False
        )
        spec = torch.squeeze(spec, 0)
-        spec1 = spectrogram_torch(audio_norm, 1280,32000, 320, 1280,center=False)
+        spec1 = spectrogram_torch(audio_norm, 1280, 32000, 320, 1280, center=False)
        mel = spec_to_mel_torch(spec1, 1280, 100, 32000, 0, None)
        mel = self.norm_spec(torch.squeeze(mel, 0))
        return spec, mel
@@ -643,7 +644,7 @@ class TextAudioSpeakerCollateV4:
        mel_lengths = torch.LongTensor(len(batch))

        spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
-        mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len*2)
+        mel_padded = torch.FloatTensor(len(batch), batch[0][2].size(0), max_spec_len * 2)
        ssl_padded = torch.FloatTensor(len(batch), batch[0][0].size(1), max_ssl_len)
        text_padded = torch.LongTensor(len(batch), max_text_len)
        # wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
--- a/GPT_SoVITS/module/mel_processing.py
+++ b/GPT_SoVITS/module/mel_processing.py
@@ -39,24 +39,36 @@ hann_window = {}

 def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.2:
-        print('min value is ', torch.min(y))
+        print("min value is ", torch.min(y))
    if torch.max(y) > 1.2:
-        print('max value is ', torch.max(y))
+        print("max value is ", torch.max(y))

    global hann_window
-    dtype_device = str(y.dtype) + '_' + str(y.device)
+    dtype_device = str(y.dtype) + "_" + str(y.device)
    # wnsize_dtype_device = str(win_size) + '_' + dtype_device
-    key = "%s-%s-%s-%s-%s" %(dtype_device,n_fft, sampling_rate, hop_size, win_size)
+    key = "%s-%s-%s-%s-%s" % (dtype_device, n_fft, sampling_rate, hop_size, win_size)
    # if wnsize_dtype_device not in hann_window:
    if key not in hann_window:
        # hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
        hann_window[key] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

-    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
+    )
    y = y.squeeze(1)
    # spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[key],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[key],
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=False,
+    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)
    return spec
@@ -64,9 +76,9 @@ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False)

 def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
-    dtype_device = str(spec.dtype) + '_' + str(spec.device)
+    dtype_device = str(spec.dtype) + "_" + str(spec.device)
    # fmax_dtype_device = str(fmax) + '_' + dtype_device
-    key = "%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, fmin, fmax)
+    key = "%s-%s-%s-%s-%s-%s" % (dtype_device, n_fft, num_mels, sampling_rate, fmin, fmax)
    # if fmax_dtype_device not in mel_basis:
    if key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
@@ -78,17 +90,25 @@ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    return spec


-
 def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.2:
-        print('min value is ', torch.min(y))
+        print("min value is ", torch.min(y))
    if torch.max(y) > 1.2:
-        print('max value is ', torch.max(y))
+        print("max value is ", torch.max(y))

    global mel_basis, hann_window
-    dtype_device = str(y.dtype) + '_' + str(y.device)
+    dtype_device = str(y.dtype) + "_" + str(y.device)
    # fmax_dtype_device = str(fmax) + '_' + dtype_device
-    fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s"%(dtype_device,n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax)
+    fmax_dtype_device = "%s-%s-%s-%s-%s-%s-%s-%s" % (
+        dtype_device,
+        n_fft,
+        num_mels,
+        sampling_rate,
+        hop_size,
+        win_size,
+        fmin,
+        fmax,
+    )
    # wnsize_dtype_device = str(win_size) + '_' + dtype_device
    wnsize_dtype_device = fmax_dtype_device
    if fmax_dtype_device not in mel_basis:
@@ -97,11 +117,23 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

-    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
+    )
    y = y.squeeze(1)

-    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[wnsize_dtype_device],
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=False,
+    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-8)

--- a/GPT_SoVITS/module/models.py
+++ b/GPT_SoVITS/module/models.py
@@ -414,7 +414,8 @@ class Generator(torch.nn.Module):
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
-        gin_channels=0,is_bias=False,
+        gin_channels=0,
+        is_bias=False,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
@@ -1173,7 +1174,7 @@ class SynthesizerTrnV3(nn.Module):
                quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")  ##BCT
                x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge)
        fea = self.bridge(x)
-        fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest")  ##BCT
+        fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest")  ##BCT
        fea, y_mask_ = self.wns1(
            fea, mel_lengths, ge
        )  ##If the 1-minute fine-tuning works fine, no need to manually adjust the learning rate.
@@ -1196,9 +1197,9 @@ class SynthesizerTrnV3(nn.Module):
            ge = self.ref_enc(refer[:, :704] * refer_mask, refer_mask)
        y_lengths = torch.LongTensor([int(codes.size(2) * 2)]).to(codes.device)
        if speed == 1:
-            sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4))
+            sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4))
        else:
-            sizee = int(codes.size(2) * (3.875 if self.version=="v3"else 4) / speed) + 1
+            sizee = int(codes.size(2) * (3.875 if self.version == "v3" else 4) / speed) + 1
        y_lengths1 = torch.LongTensor([sizee]).to(codes.device)
        text_lengths = torch.LongTensor([text.size(-1)]).to(text.device)

@@ -1207,7 +1208,7 @@ class SynthesizerTrnV3(nn.Module):
            quantized = F.interpolate(quantized, scale_factor=2, mode="nearest")  ##BCT
        x, m_p, logs_p, y_mask = self.enc_p(quantized, y_lengths, text, text_lengths, ge, speed)
        fea = self.bridge(x)
-        fea = F.interpolate(fea, scale_factor=(1.875 if self.version=="v3"else 2), mode="nearest")  ##BCT
+        fea = F.interpolate(fea, scale_factor=(1.875 if self.version == "v3" else 2), mode="nearest")  ##BCT
        ####more wn paramter to learn mel
        fea, y_mask_ = self.wns1(fea, y_lengths1, ge)
        return fea, ge