more code refactor

This commit is contained in:
Blaise
2024-01-16 17:14:18 +01:00
parent 0d92575115
commit 0d3d47f3c3
44 changed files with 4516 additions and 2623 deletions

View File

@@ -6,49 +6,56 @@ import cn2an
from pypinyin import lazy_pinyin, Style
import sys
sys.path.append("/data/docker/liujing04/gpt-vits/gpt-vits-master")
from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
# Directory that contains this module; data files are resolved relative to it.
current_file_path = os.path.dirname(__file__)

# Lookup table built from "opencpop-strict.txt": the first tab-separated
# column of each line is the key and the second column is the value.
# The file is opened in a ``with`` block so the handle is closed as soon as
# the table is built, and the encoding is explicit so loading does not depend
# on the platform's default locale.
with open(
    os.path.join(current_file_path, "opencpop-strict.txt"), encoding="utf-8"
) as _symbol_file:
    pinyin_to_symbol_map = {
        line.split("\t")[0]: line.strip().split("\t")[1]
        for line in _symbol_file
        # Skip blank lines; they have no second column and would raise
        # IndexError.
        if line.strip()
    }
import jieba.posseg as psg
# Maps punctuation found in raw input text to the replacement string that is
# substituted for it (see the regex built from these keys in
# replace_punctuation).
#
# NOTE(review): in this copy of the file every full-width key had been
# reduced to an empty string, which collapses the literal to a single ""
# entry and produces an empty regex alternative. The keys below are
# reconstructed from their replacement values (full-width colon, semicolon,
# comma, full stop, exclamation mark, question mark, enumeration comma and
# em dash). The replacement for "..." was likewise empty and is assumed to be
# the single-character ellipsis. Confirm against the repository version.
rep_map = {
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
}
tone_modifier = ToneSandhi()
def replace_punctuation(text):
    """Normalize punctuation in *text* and drop unsupported characters.

    Every key of ``rep_map`` that occurs in the text is replaced by its
    mapped value. Afterwards, every run of characters that is neither in the
    range U+4E00-U+9FA5 nor one of the marks in ``punctuation`` is removed.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    # NOTE(review): both arguments of each replace() are empty strings in
    # this copy, which makes the two calls no-ops. The intended characters
    # appear to have been lost -- restore them from the repository version.
    text = text.replace("", "").replace("", "")

    # An empty key would become an empty alternative that matches at every
    # position of the text, so only non-empty keys take part in the pattern.
    keys = [key for key in rep_map if key]
    if keys:
        pattern = re.compile("|".join(re.escape(key) for key in keys))
        text = pattern.sub(lambda match: rep_map[match.group()], text)

    # Escape each mark before placing it inside the character class so that
    # marks such as "-", "]" or "^" are always taken literally.
    allowed_marks = "".join(re.escape(mark) for mark in punctuation)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + allowed_marks + r"]+", "", text
    )
    return replaced_text
def g2p(text):
    """Split *text* into sentences and convert them with ``_g2p``.

    The text is split directly after each mark listed in ``punctuation``
    (any whitespace following the mark is consumed by the split). Pieces
    that are empty or whitespace-only are discarded before conversion.

    Args:
        text: The input string.

    Returns:
        The ``(phones, word2ph)`` pair returned by ``_g2p`` for the
        resulting list of sentences.
    """
    # Escape each mark so that characters such as "-", "]" or "^" cannot
    # change the meaning of the character class inside the look-behind.
    marks = "".join(re.escape(mark) for mark in punctuation)
    pattern = r"(?<=[" + marks + r"])\s*"
    sentences = [piece for piece in re.split(pattern, text) if piece.strip()]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
@@ -56,10 +63,10 @@ def g2p(text):
def _get_initials_finals(word):
initials = []
finals = []
orig_initials = lazy_pinyin(
word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_finals = lazy_pinyin(
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
)
for c, v in zip(orig_initials, orig_finals):
initials.append(c)
finals.append(v)
@@ -72,17 +79,16 @@ def _g2p(segments):
for seg in segments:
pinyins = []
# Replace all English words in the sentence
seg = re.sub('[a-zA-Z]+', '', seg)
seg = re.sub("[a-zA-Z]+", "", seg)
seg_cut = psg.lcut(seg)
initials = []
finals = []
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
for word, pos in seg_cut:
if pos == 'eng':
if pos == "eng":
continue
sub_initials, sub_finals = _get_initials_finals(word)
sub_finals = tone_modifier.modified_tone(word, pos,
sub_finals)
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
initials.append(sub_initials)
finals.append(sub_finals)
@@ -91,7 +97,7 @@ def _g2p(segments):
finals = sum(finals, [])
#
for c, v in zip(initials, finals):
raw_pinyin = c+v
raw_pinyin = c + v
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if c == v:
@@ -102,40 +108,40 @@ def _g2p(segments):
v_without_tone = v[:-1]
tone = v[-1]
pinyin = c+v_without_tone
assert tone in '12345'
pinyin = c + v_without_tone
assert tone in "12345"
if c:
# 多音节
v_rep_map = {
"uei": 'ui',
'iou': 'iu',
'uen': 'un',
"uei": "ui",
"iou": "iu",
"uen": "un",
}
if v_without_tone in v_rep_map.keys():
pinyin = c+v_rep_map[v_without_tone]
pinyin = c + v_rep_map[v_without_tone]
else:
# 单音节
pinyin_rep_map = {
'ing': 'ying',
'i': 'yi',
'in': 'yin',
'u': 'wu',
"ing": "ying",
"i": "yi",
"in": "yin",
"u": "wu",
}
if pinyin in pinyin_rep_map.keys():
pinyin = pinyin_rep_map[pinyin]
else:
single_rep_map = {
'v': 'yu',
'e': 'e',
'i': 'y',
'u': 'w',
"v": "yu",
"e": "e",
"i": "y",
"u": "w",
}
if pinyin[0] in single_rep_map.keys():
pinyin = single_rep_map[pinyin[0]]+pinyin[1:]
pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
new_c, new_v = pinyin_to_symbol_map[pinyin].split(' ')
new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
new_v = new_v + tone
phone = [new_c, new_v]
word2ph.append(len(phone))
@@ -144,9 +150,8 @@ def _g2p(segments):
return phones_list, word2ph
def text_normalize(text):
numbers = re.findall(r'\d+(?:\.?\d+)?', text)
numbers = re.findall(r"\d+(?:\.?\d+)?", text)
for number in numbers:
text = text.replace(number, cn2an.an2cn(number), 1)
text = replace_punctuation(text)
@@ -154,7 +159,7 @@ def text_normalize(text):
return text
if __name__ == '__main__':
if __name__ == "__main__":
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
text = "呣呣呣~就是…大人的鼹鼠党吧?"
text = "你好"