support gpt-sovits v2
GPT_SoVITS/text/g2pw/dataset.py
166 lines · Normal file
@@ -0,0 +1,166 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import numpy as np

from .utils import tokenize_and_map

ANCHOR_CHAR = '▁'
def prepare_onnx_input(tokenizer,
                       labels: List[str],
                       char2phonemes: Dict[str, List[int]],
                       chars: List[str],
                       texts: List[str],
                       query_ids: List[int],
                       use_mask: bool=False,
                       window_size: Optional[int]=None,
                       max_len: int=512) -> Dict[str, np.ndarray]:
    """Build the batched feed dict of numpy arrays for the g2pW ONNX model,
    one polyphonic query character per text."""
    if window_size is not None:
        truncated_texts, truncated_query_ids = _truncate_texts(
            window_size=window_size, texts=texts, query_ids=query_ids)
    input_ids = []
    token_type_ids = []
    attention_masks = []
    phoneme_masks = []
    char_ids = []
    position_ids = []

    for idx in range(len(texts)):
        text = (truncated_texts if window_size else texts)[idx].lower()
        query_id = (truncated_query_ids if window_size else query_ids)[idx]

        try:
            tokens, text2token, token2text = tokenize_and_map(
                tokenizer=tokenizer, text=text)
        except Exception:
            print(f'warning: text "{text}" is invalid')
            return {}

        text, query_id, tokens, text2token, token2text = _truncate(
            max_len=max_len,
            text=text,
            query_id=query_id,
            tokens=tokens,
            text2token=text2token,
            token2text=token2text)

        processed_tokens = ['[CLS]'] + tokens + ['[SEP]']

        input_id = list(
            np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
        token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int))
        attention_mask = list(np.ones((len(processed_tokens), ), dtype=int))

        query_char = text[query_id]
        phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \
            if use_mask else [1] * len(labels)
        char_id = chars.index(query_char)
        position_id = text2token[
            query_id] + 1  # the [CLS] token occupies the first position

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        attention_masks.append(attention_mask)
        phoneme_masks.append(phoneme_mask)
        char_ids.append(char_id)
        position_ids.append(position_id)

    outputs = {
        'input_ids': np.array(input_ids).astype(np.int64),
        'token_type_ids': np.array(token_type_ids).astype(np.int64),
        'attention_masks': np.array(attention_masks).astype(np.int64),
        'phoneme_masks': np.array(phoneme_masks).astype(np.float32),
        'char_ids': np.array(char_ids).astype(np.int64),
        'position_ids': np.array(position_ids).astype(np.int64),
    }
    return outputs
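
# Illustrative usage sketch, not part of this file: wiring the feed dict into
# an onnxruntime session. The tokenizer checkpoint ('bert-base-chinese'), the
# model path ('g2pW.onnx'), and the toy polyphone table below are assumptions
# for illustration, not values fixed by this module.
#
# import onnxruntime
# from transformers import BertTokenizer
# from GPT_SoVITS.text.g2pw.dataset import (get_phoneme_labels,
#                                           prepare_onnx_input)
#
# polyphonic_chars = [['行', 'xing2'], ['行', 'hang2']]
# labels, char2phonemes = get_phoneme_labels(polyphonic_chars)
# chars = sorted(char2phonemes.keys())
#
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# inputs = prepare_onnx_input(
#     tokenizer=tokenizer, labels=labels, char2phonemes=char2phonemes,
#     chars=chars, texts=['我们银行的行长'], query_ids=[5], use_mask=True)
#
# session = onnxruntime.InferenceSession('g2pW.onnx')
# # Feed only the arrays the exported graph actually declares as inputs; the
# # dict keys built above ('input_ids', ...) are the usual g2pW export names.
# feed = {i.name: inputs[i.name] for i in session.get_inputs()}
# probs = session.run(None, feed)[0]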


def _truncate_texts(window_size: int, texts: List[str],
                    query_ids: List[int]) -> Tuple[List[str], List[int]]:
    """Cut each text down to a window around its query character and remap
    the query index into the window."""
    truncated_texts = []
    truncated_query_ids = []
    for text, query_id in zip(texts, query_ids):
        start = max(0, query_id - window_size // 2)
        end = min(len(text), query_id + window_size // 2)
        truncated_text = text[start:end]
        truncated_texts.append(truncated_text)

        truncated_query_id = query_id - start
        truncated_query_ids.append(truncated_query_id)
    return truncated_texts, truncated_query_ids
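
# Quick worked example of the windowing above (illustrative, not part of this
# file): a 4-char window around query index 7 of a 10-char string.
#
# from GPT_SoVITS.text.g2pw.dataset import _truncate_texts
#
# texts, query_ids = _truncate_texts(
#     window_size=4, texts=['abcdefghij'], query_ids=[7])
# assert texts == ['fghi']                       # text[5:9]
# assert query_ids == [2]                        # 'h', remapped into the window
# assert texts[0][query_ids[0]] == 'abcdefghij'[7]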


def _truncate(max_len: int,
              text: str,
              query_id: int,
              tokens: List[str],
              text2token: List[int],
              token2text: List[Tuple[int, int]]):
    """Truncate to max_len - 2 tokens (leaving room for [CLS]/[SEP]),
    keeping the query token as close to the centre as possible."""
    truncate_len = max_len - 2
    if len(tokens) <= truncate_len:
        return (text, query_id, tokens, text2token, token2text)

    token_position = text2token[query_id]

    token_start = token_position - truncate_len // 2
    token_end = token_start + truncate_len
    front_exceed_dist = -token_start
    back_exceed_dist = token_end - len(tokens)
    if front_exceed_dist > 0:  # window ran past the front; clamp to the start
        token_start += front_exceed_dist
        token_end += front_exceed_dist
    elif back_exceed_dist > 0:  # window ran past the back; clamp to the end
        token_start -= back_exceed_dist
        token_end -= back_exceed_dist

    start = token2text[token_start][0]
    end = token2text[token_end - 1][1]

    return (text[start:end], query_id - start, tokens[token_start:token_end], [
        i - token_start if i is not None else None
        for i in text2token[start:end]
    ], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
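
# Worked example of the re-centring above (illustrative, not part of this
# file), using a toy one-character-per-token mapping so the index arithmetic
# is easy to follow.
#
# from GPT_SoVITS.text.g2pw.dataset import _truncate
#
# text = 'abcdefgh'
# out_text, out_qid, out_tokens, _, _ = _truncate(
#     max_len=6,  # truncate_len = 4 tokens after reserving [CLS]/[SEP]
#     text=text,
#     query_id=1,
#     tokens=list(text),
#     text2token=list(range(len(text))),
#     token2text=[(i, i + 1) for i in range(len(text))])
# assert out_text == 'abcd'       # window clamped against the front of the text
# assert out_text[out_qid] == 'b'
# assert out_tokens == ['a', 'b', 'c', 'd']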


def get_phoneme_labels(polyphonic_chars: List[List[str]]
                       ) -> Tuple[List[str], Dict[str, List[int]]]:
    """Collect the sorted phoneme label set and map each character to the
    indices of its candidate phonemes."""
    labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        if char not in char2phonemes:
            char2phonemes[char] = []
        char2phonemes[char].append(labels.index(phoneme))
    return labels, char2phonemes
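
# Shape of the returned structures on a toy polyphone table (illustrative,
# not part of this file).
#
# from GPT_SoVITS.text.g2pw.dataset import get_phoneme_labels
#
# labels, char2phonemes = get_phoneme_labels(
#     [['行', 'xing2'], ['行', 'hang2'], ['还', 'hai2'], ['还', 'huan2']])
# assert labels == ['hai2', 'hang2', 'huan2', 'xing2']  # sorted, de-duplicated
# assert char2phonemes == {'行': [3, 1], '还': [0, 2]}   # indices into labels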


def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
                            ) -> Tuple[List[str], Dict[str, List[int]]]:
    """Same as get_phoneme_labels, but labels are 'char phoneme' pairs, so
    identical phonemes of different characters stay distinct."""
    labels = sorted(
        list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        if char not in char2phonemes:
            char2phonemes[char] = []
        char2phonemes[char].append(labels.index(f'{char} {phoneme}'))
    return labels, char2phonemes