more code refactor
@@ -11,23 +11,30 @@ logging.getLogger("numba").setLevel(logging.WARNING)
 from transformers import (
     Wav2Vec2FeatureExtractor,
     HubertModel,
     Wav2Vec2Model,
 )
 
 import utils
 import torch.nn as nn
 
-cnhubert_base_path=None
+cnhubert_base_path = None
 
 
 class CNHubert(nn.Module):
     def __init__(self):
         super().__init__()
         self.model = HubertModel.from_pretrained(cnhubert_base_path)
-        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(cnhubert_base_path)
+        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            cnhubert_base_path
+        )
 
     def forward(self, x):
-        input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
+        input_values = self.feature_extractor(
+            x, return_tensors="pt", sampling_rate=16000
+        ).input_values.to(x.device)
         feats = self.model(input_values)["last_hidden_state"]
         return feats
 
 
 # class CNHubertLarge(nn.Module):
 #     def __init__(self):
 #         super().__init__()
@@ -59,12 +66,12 @@ class CNHubert(nn.Module):
 #         return feats
 
 
 def get_model():
     model = CNHubert()
     model.eval()
     return model
 
 
 # def get_large_model():
 #     model = CNHubertLarge()
 #     model.eval()
@@ -80,18 +87,18 @@ def get_model():
 #     model.eval()
 #     return model
 
 
 def get_content(hmodel, wav_16k_tensor):
     with torch.no_grad():
         feats = hmodel(wav_16k_tensor)
-    return feats.transpose(1,2)
+    return feats.transpose(1, 2)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     model = get_model()
     src_path = "/Users/Shared/原音频2.wav"
     wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
     model = model
     wav_16k_tensor = wav_16k_tensor
-    feats = get_content(model,wav_16k_tensor)
+    feats = get_content(model, wav_16k_tensor)
     print(feats.shape)
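For reference, a minimal usage sketch of the HuBERT feature-extractor module diffed above. The cnhubert import name and the "TencentGameMate/chinese-hubert-base" checkpoint id are assumptions for illustration, not part of this commit; any local path holding compatible HuBERT weights plus a feature-extractor config would do.

import torch

import cnhubert  # assumed import path for the module diffed above

# Point the module at a chinese-hubert-base checkpoint before building the
# model (assumed checkpoint id; the project configures this externally).
cnhubert.cnhubert_base_path = "TencentGameMate/chinese-hubert-base"

model = cnhubert.get_model()      # CNHubert wrapper, already in eval mode
wav_16k = torch.randn(16000 * 5)  # stand-in for 5 s of 16 kHz mono audio

feats = cnhubert.get_content(model, wav_16k)
# HuBERT-base emits one 768-dim frame per 320 input samples (~20 ms at
# 16 kHz), and get_content transposes to (batch, dim, frames):
print(feats.shape)  # e.g. torch.Size([1, 768, 249])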
@@ -3,20 +3,23 @@ import torch
 
 def get_model():
     import whisper
-    model = whisper.load_model("small", device='cpu')
+    model = whisper.load_model("small", device="cpu")
 
     return model.encoder
 
 
 def get_content(model=None, wav_16k_tensor=None):
     from whisper import log_mel_spectrogram, pad_or_trim
 
     dev = next(model.parameters()).device
     mel = log_mel_spectrogram(wav_16k_tensor).to(dev)[:, :3000]
     # if torch.cuda.is_available():
     #     mel = mel.to(torch.float16)
     feature_len = mel.shape[-1] // 2
     assert mel.shape[-1] < 3000, "输入音频过长,只允许输入30以内音频"  # "Input audio is too long; only audio within 30 seconds is allowed."
     with torch.no_grad():
-        feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[:1, :feature_len, :].transpose(1,2)
+        feature = model(pad_or_trim(mel, 3000).unsqueeze(0))[
+            :1, :feature_len, :
+        ].transpose(1, 2)
     return feature
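The feature_len = mel.shape[-1] // 2 crop in the hunk above reflects how Whisper's encoder works: log_mel_spectrogram yields 100 mel frames per second, pad_or_trim pads to the fixed 3000-frame (30 s) window, and the encoder's stride-2 convolution halves the time axis, so only the first half-length output frames correspond to real audio. A minimal sketch follows, assuming the openai-whisper package is installed and the file is importable as whisper_enc (both assumptions; loading "small" will fetch its weights on first use).

import torch

import whisper_enc  # assumed import path for the file diffed above

encoder = whisper_enc.get_model()  # Whisper "small" encoder, kept on CPU
wav_16k = torch.randn(16000 * 5)   # stand-in for 5 s of 16 kHz mono audio

feature = whisper_enc.get_content(encoder, wav_16k)
# 5 s -> 500 mel frames -> feature_len = 250 encoder frames, each of
# width 768 (the "small" model's d_model), transposed to (batch, dim, frames):
print(feature.shape)  # torch.Size([1, 768, 250])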