Introduce Docker and Windows CI Workflow, Pre-commit Formatting, and Language Resource Auto-Download (#2351)

* Docker Auto-Build Workflow

* Rename

* Update

* Fix Bugs

* Disable Progress Bar When workflows triggered

* Fix Wget

* Fix Bugs

* Fix Bugs

* Update Wget

* Update Workflows

* Accelerate Docker Image Building

* Fix Install.sh

* Add Skip-Check For Action Runner

* Fix Dockerfile

* .

* .

* .

* .

* Delete File in Runner

* Add Sort

* Delete More Files

* Delete More

* .

* .

* .

* Add Pre-Commit Hook
Update Docker

* Add Code Spell Check

* [pre-commit.ci] trigger

* [pre-commit.ci] trigger

* [pre-commit.ci] trigger

* Fix Bugs

* .

* Disable Progress Bar and Logs while using GitHub Actions

* .

* .

* Fix Bugs

* update conda

* fix bugs

* Fix Bugs

* fix bugs

* .

* .

* Quiet Installation

* fix bugs

* .

* fix bug

* .

* Fix pre-commit.ci and Docker

* fix bugs

* .

* Update Docker & Pre-Commit

* fix  bugs

* Update Req

* Update Req

* Update OpenCC

* update precommit

* .

* Update .pre-commit-config.yaml

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update Docs and fix bugs

* Fix \

* Fix MacOS

* .

* test

* .

* Add Tag Alias

* .

* fix bugs

* fix bugs

* make image smaller

* update pre-commit config

* .

* .

* fix bugs

* use miniconda

* Fix Wrong Path

* .

* debug

* debug

* revert

* Fix Bugs

* Update Docs, Add Dict Auto Download in install.sh

* update docker_build

* Update Docs for Install.sh

* update docker docs about architecture

* Add Xcode-Commandline-Tool Installation

* Update Docs

1. Add Missing VC17
2. Modufied the Order of FFmpeg Installation and Requirements Installation
3. Remove Duplicate FFmpeg

* Fix Wrong Cuda Version

* Update TESTED ENV

* Add PYTHONNOUSERSITE(-s)

* Fix Wrapper

* Update install.sh For Robustness

* Ignore .git

* Preload CUDNN For Ctranslate2

* Remove Gradio Warnings

* Update Colab

* Fix OpenCC Problems

* Update Win DLL Strategy

* Fix Onnxruntime-gpu NVRTC Error

* Fix Path Problems

* Add Windows Packages Workflow

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* .

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* Fix Path

* Fix Path

* Enable Logging

* Set 7-Zip compression level to maximum (-mx=9)

* Use Multithread in ONNX Session

* Fix Tag Bugs

* Add Time

* Add Time

* Add Time

* Compress More

* Copy DLL to Solve VC Runtime DLL Missing Issues

* Expose FFmpeg Errors, Copy Only Part of Visual C++ Runtime

* Update build_windows_packages.ps1

* Update build_windows_packages.ps1

* Update build_windows_packages.ps1

* Update build_windows_packages.ps1

* WIP

* WIP

* WIP

* Update build_windows_packages.ps1

* Update install.sh

* Update build_windows_packages.ps1

* Update docker-publish.yaml

* Update install.sh

* Update Dockerfile

* Update docker_build.sh

* Update miniconda_install.sh

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update Colab-WebUI.ipynb

* Update Colab-Inference.ipynb

* Update docker-compose.yaml

* 更新 build_windows_packages.ps1

* Update install.sh

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
XXXXRT666
2025-05-26 05:45:14 +03:00
committed by GitHub
parent 13055fa569
commit d5e479dad6
58 changed files with 2096 additions and 987 deletions

View File

@@ -10,6 +10,7 @@ from faster_whisper import WhisperModel
from tqdm import tqdm
from tools.asr.config import check_fw_local_models
from tools.my_utils import load_cudnn
# fmt: off
language_code_list = [
@@ -93,6 +94,8 @@ def execute_asr(input_folder, output_folder, model_size, language, precision):
return output_file_path
load_cudnn()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(

View File

@@ -1,11 +1,15 @@
import ctypes
import os
import traceback
import sys
from pathlib import Path
import ffmpeg
import numpy as np
import gradio as gr
from tools.i18n.i18n import I18nAuto
import numpy as np
import pandas as pd
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto(language=os.environ.get("language", "Auto"))
@@ -15,7 +19,7 @@ def load_audio(file, sr):
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车
if os.path.exists(file) == False:
if os.path.exists(file) is False:
raise RuntimeError("You input a wrong audio path that does not exists, please fix it!")
out, _ = (
ffmpeg.input(file, threads=0)
@@ -23,7 +27,11 @@ def load_audio(file, sr):
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except Exception:
traceback.print_exc()
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True)
) # Expose the Error
raise RuntimeError(i18n("音频加载失败"))
return np.frombuffer(out, np.float32).flatten()
@@ -127,3 +135,97 @@ def check_details(path_list=None, is_train=False, is_dataset_processing=False):
...
else:
gr.Warning(i18n("缺少语义数据集"))
def load_cudnn():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping cuDNN setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("cudnn_cnn*.dll"))
if not matching_files:
print(f"[ERROR] No cudnn_cnn*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
cudnn_dir = site_packages / "nvidia" / "cudnn" / "lib"
if not cudnn_dir.exists():
print(f"[ERROR] cudnn dir not found: {cudnn_dir}")
return
matching_files = sorted(cudnn_dir.glob("libcudnn_cnn*.so*"))
if not matching_files:
print(f"[ERROR] No libcudnn_cnn*.so* found in {cudnn_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")
def load_nvrtc():
import torch
if not torch.cuda.is_available():
print("[INFO] CUDA is not available, skipping nvrtc setup.")
return
if sys.platform == "win32":
torch_lib_dir = Path(torch.__file__).parent / "lib"
if torch_lib_dir.exists():
os.add_dll_directory(str(torch_lib_dir))
print(f"[INFO] Added DLL directory: {torch_lib_dir}")
matching_files = sorted(torch_lib_dir.glob("nvrtc*.dll"))
if not matching_files:
print(f"[ERROR] No nvrtc*.dll found in {torch_lib_dir}")
return
for dll_path in matching_files:
dll_name = os.path.basename(dll_path)
try:
ctypes.CDLL(dll_name)
print(f"[INFO] Loaded: {dll_name}")
except OSError as e:
print(f"[WARNING] Failed to load {dll_name}: {e}")
else:
print(f"[WARNING] Torch lib directory not found: {torch_lib_dir}")
elif sys.platform == "linux":
site_packages = Path(torch.__file__).resolve().parents[1]
nvrtc_dir = site_packages / "nvidia" / "cuda_nvrtc" / "lib"
if not nvrtc_dir.exists():
print(f"[ERROR] nvrtc dir not found: {nvrtc_dir}")
return
matching_files = sorted(nvrtc_dir.glob("libnvrtc*.so*"))
if not matching_files:
print(f"[ERROR] No libnvrtc*.so* found in {nvrtc_dir}")
return
for so_path in matching_files:
try:
ctypes.CDLL(so_path, mode=ctypes.RTLD_GLOBAL) # type: ignore
print(f"[INFO] Loaded: {so_path}")
except OSError as e:
print(f"[WARNING] Failed to load {so_path}: {e}")

View File

@@ -1,7 +1,7 @@
import argparse
import os
import copy
import json
import os
import uuid
try:
@@ -11,8 +11,8 @@ try:
except:
...
import librosa
import gradio as gr
import librosa
import numpy as np
import soundfile
@@ -303,7 +303,7 @@ if __name__ == "__main__":
set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch)
with gr.Blocks() as demo:
with gr.Blocks(analytics_enabled=False) as demo:
with gr.Row():
btn_change_index = gr.Button("Change Index")
btn_submit_change = gr.Button("Submit Text")

View File

@@ -32,18 +32,10 @@ def make_pair(mix_dir, inst_dir):
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted(
[
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
[os.path.join(mix_dir, fname) for fname in os.listdir(mix_dir) if os.path.splitext(fname)[1] in input_exts]
)
y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
[os.path.join(inst_dir, fname) for fname in os.listdir(inst_dir) if os.path.splitext(fname)[1] in input_exts]
)
filelist = list(zip(X_list, y_list))
@@ -65,14 +57,10 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
train_filelist = filelist[:-val_size]
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist if list(pair) not in val_filelist
]
train_filelist = [pair for pair in filelist if list(pair) not in val_filelist]
elif split_mode == "subdirs":
if len(val_filelist) != 0:
raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
raise ValueError("The `val_filelist` option is not available in `subdirs` mode")
train_filelist = make_pair(
os.path.join(dataset_dir, "training/mixtures"),
@@ -91,9 +79,7 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
if np.random.uniform() < 0.5:
# swap channel
@@ -152,9 +138,7 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(cropsize, sr, hop_length, n_fft, offset)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,24 +89,14 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -63,9 +63,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
@@ -91,30 +89,16 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
self.conv3 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv6 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.conv7 = SeperableConv2DBNActiv(nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = nn.Sequential(Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1))
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -40,9 +40,7 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
super(Decoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
# self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
@@ -72,23 +70,15 @@ class ASPPModule(nn.Module):
Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
self.conv3 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = Conv2DBNActiv(
nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode="bilinear", align_corners=True)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)
@@ -106,12 +96,8 @@ class LSTMModule(nn.Module):
def __init__(self, nin_conv, nin_lstm, nout_lstm):
super(LSTMModule, self).__init__()
self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
self.lstm = nn.LSTM(
input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
)
self.dense = nn.Sequential(
nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
)
self.lstm = nn.LSTM(input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True)
self.dense = nn.Sequential(nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU())
def forward(self, x):
N, _, nbins, nframes = x.size()

View File

@@ -1,5 +1,4 @@
import json
import os
import pathlib
default_param = {}
@@ -48,9 +47,7 @@ class ModelParameters(object):
import zipfile
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
self.param = json.loads(zip.read("param.json"), object_pairs_hook=int_keys)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
@@ -65,5 +62,5 @@ class ModelParameters(object):
"stereo_n",
"reverse",
]:
if not k in self.param:
if k not in self.param:
self.param[k] = False

View File

@@ -3,8 +3,6 @@ import torch
import torch.nn.functional as F
from torch import nn
from . import spec_utils
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):

View File

@@ -1,4 +1,3 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

View File

@@ -1,4 +1,3 @@
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

View File

@@ -6,9 +6,7 @@ from . import layers_new
class BaseNet(nn.Module):
def __init__(
self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
):
def __init__(self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))):
super(BaseNet, self).__init__()
self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
@@ -56,21 +54,15 @@ class CascadedNet(nn.Module):
layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
)
self.stg1_high_band_net = BaseNet(
2, nout // 4, self.nin_lstm // 2, nout_lstm // 2
)
self.stg1_high_band_net = BaseNet(2, nout // 4, self.nin_lstm // 2, nout_lstm // 2)
self.stg2_low_band_net = nn.Sequential(
BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm),
layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
)
self.stg2_high_band_net = BaseNet(
nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2
)
self.stg2_high_band_net = BaseNet(nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2)
self.stg3_full_band_net = BaseNet(
3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
)
self.stg3_full_band_net = BaseNet(3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm)
self.out = nn.Conv2d(nout, 2, 1, bias=False)
self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

View File

@@ -27,9 +27,7 @@ def crop_center(h1, h2):
return h1
def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
@@ -43,7 +41,7 @@ def wave_to_spectrogram(
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right])
@@ -51,9 +49,7 @@ def wave_to_spectrogram(
return spec
def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
import threading
if reverse:
@@ -103,21 +99,13 @@ def combine_spectrograms(specs, mp):
raise ValueError("Too much bins")
# lowpass fiter
if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if mp.param["pre_filter_start"] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
spec_c = fft_lp_filter(spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"])
else:
gp = 1
for b in range(
mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
for b in range(mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]):
g = math.pow(10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0)
gp = g
spec_c[:, b, :] *= g
@@ -189,9 +177,7 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
else:
e += fade_size
mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
mag[:, :, s + fade_size : e - fade_size] += ref[:, :, s + fade_size : e - fade_size]
old_e = e
return mag
@@ -207,9 +193,7 @@ def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = "mph{}".format(
hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
)
cache_dir = "mph{}".format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest())
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)
@@ -230,31 +214,27 @@ def cache_or_load(mix_path, inst_path, mp):
if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load(
mix_path,
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"]
mix_path, sr=bp["sr"], mono=False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load(
inst_path,
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"],
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
@@ -302,9 +282,7 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray(
[
@@ -326,9 +304,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread = threading.Thread(target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length})
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
@@ -336,9 +312,7 @@ def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
elif mid_side_b2:
return np.asfortranarray(
[
@@ -357,21 +331,15 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
for d in range(1, bands_n + 1):
bp = mp.param["band"][d]
spec_s = np.ndarray(
shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
)
spec_s = np.ndarray(shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[:, offset : offset + h, :]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp["n_fft"] // 2
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
:, :extra_bins_h, :
]
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1:
@@ -405,9 +373,9 @@ def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
mp.param["mid_side_b2"],
mp.param["reverse"],
),
orig_sr = bp["sr"],
target_sr = sr,
res_type = "sinc_fastest",
orig_sr=bp["sr"],
target_sr=sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
@@ -456,10 +424,7 @@ def mirroring(a, spec_m, input_high_end, mp):
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10,
:,
]
),
@@ -467,19 +432,14 @@ def mirroring(a, spec_m, input_high_end, mp):
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
mp.param["pre_filter_start"] - 10 - input_high_end.shape[1] : mp.param["pre_filter_start"] - 10,
:,
]
),
@@ -528,7 +488,6 @@ def istft(spec, hl):
if __name__ == "__main__":
import argparse
import sys
import time
import cv2
@@ -573,10 +532,10 @@ if __name__ == "__main__":
if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load(
args.input[i],
sr = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"],
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)
if len(wave[d].shape) == 1: # mono to stereo
@@ -584,9 +543,9 @@ if __name__ == "__main__":
else: # lower bands
wave[d] = librosa.resample(
wave[d + 1],
orig_sr = mp.param["band"][d + 1]["sr"],
target_sr = bp["sr"],
res_type = bp["res_type"],
orig_sr=mp.param["band"][d + 1]["sr"],
target_sr=bp["sr"],
res_type=bp["res_type"],
)
spec[d] = wave_to_spectrogram(

View File

@@ -27,9 +27,7 @@ def inference(X_spec, device, model, aggressiveness, data):
data : dic configs
"""
def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True):
model.eval()
with torch.no_grad():
preds = []
@@ -39,9 +37,7 @@ def inference(X_spec, device, model, aggressiveness, data):
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = X_mag_pad[None, :, :, start : start + data["window_size"]]
X_mag_window = torch.from_numpy(X_mag_window)
if is_half:
X_mag_window = X_mag_window.half()
@@ -76,9 +72,7 @@ def inference(X_spec, device, model, aggressiveness, data):
is_half = True
else:
is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
pred = pred[:, :, :n_frame]
if data["tta"]:
@@ -88,9 +82,7 @@ def inference(X_spec, device, model, aggressiveness, data):
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
pred_tta = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred_tta = _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half)
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame]

View File

@@ -1,26 +1,22 @@
import logging
import os
import traceback
import gradio as gr
import logging
from tools.i18n.i18n import I18nAuto
from tools.my_utils import clean_path
i18n = I18nAuto()
logger = logging.getLogger(__name__)
import sys
import ffmpeg
import torch
import sys
from bsroformer import Roformer_Loader
from mdxnet import MDXNetDereverb
from vr import AudioPre, AudioPreDeEcho
from bsroformer import Roformer_Loader
try:
import gradio.analytics as analytics
analytics.version_check = lambda: None
except:
...
weight_uvr5_root = "tools/uvr5/uvr5_weights"
uvr5_names = []
@@ -129,7 +125,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
yield "\n".join(infos)
with gr.Blocks(title="UVR5 WebUI") as app:
with gr.Blocks(title="UVR5 WebUI", analytics_enabled=False) as app:
gr.Markdown(
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
+ "<br>"