more code refactor
This commit is contained in:
@@ -6,49 +6,56 @@ import cn2an
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
|
||||
import sys
|
||||
|
||||
sys.path.append("/data/docker/liujing04/gpt-vits/gpt-vits-master")
|
||||
|
||||
from text.symbols import punctuation
|
||||
from text.tone_sandhi import ToneSandhi
|
||||
|
||||
current_file_path = os.path.dirname(__file__)
|
||||
pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in
|
||||
open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()}
|
||||
pinyin_to_symbol_map = {
|
||||
line.split("\t")[0]: line.strip().split("\t")[1]
|
||||
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
||||
}
|
||||
|
||||
import jieba.posseg as psg
|
||||
|
||||
|
||||
rep_map = {
|
||||
':': ',',
|
||||
';': ',',
|
||||
',': ',',
|
||||
'。': '.',
|
||||
'!': '!',
|
||||
'?': '?',
|
||||
'\n': '.',
|
||||
":": ",",
|
||||
";": ",",
|
||||
",": ",",
|
||||
"。": ".",
|
||||
"!": "!",
|
||||
"?": "?",
|
||||
"\n": ".",
|
||||
"·": ",",
|
||||
'、': ",",
|
||||
'...': '…',
|
||||
'$': '.',
|
||||
'/': ',',
|
||||
'—': "-"
|
||||
"、": ",",
|
||||
"...": "…",
|
||||
"$": ".",
|
||||
"/": ",",
|
||||
"—": "-",
|
||||
}
|
||||
|
||||
tone_modifier = ToneSandhi()
|
||||
|
||||
|
||||
def replace_punctuation(text):
|
||||
text = text.replace("嗯", "恩").replace("呣","母")
|
||||
pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys()))
|
||||
text = text.replace("嗯", "恩").replace("呣", "母")
|
||||
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(r'[^\u4e00-\u9fa5'+"".join(punctuation)+r']+', '', replaced_text)
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
|
||||
return replaced_text
|
||||
|
||||
|
||||
def g2p(text):
|
||||
pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation))
|
||||
sentences = [i for i in re.split(pattern, text) if i.strip()!='']
|
||||
pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
|
||||
sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
|
||||
phones, word2ph = _g2p(sentences)
|
||||
return phones, word2ph
|
||||
|
||||
@@ -56,10 +63,10 @@ def g2p(text):
|
||||
def _get_initials_finals(word):
|
||||
initials = []
|
||||
finals = []
|
||||
orig_initials = lazy_pinyin(
|
||||
word, neutral_tone_with_five=True, style=Style.INITIALS)
|
||||
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
|
||||
orig_finals = lazy_pinyin(
|
||||
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
|
||||
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
|
||||
)
|
||||
for c, v in zip(orig_initials, orig_finals):
|
||||
initials.append(c)
|
||||
finals.append(v)
|
||||
@@ -72,17 +79,16 @@ def _g2p(segments):
|
||||
for seg in segments:
|
||||
pinyins = []
|
||||
# Replace all English words in the sentence
|
||||
seg = re.sub('[a-zA-Z]+', '', seg)
|
||||
seg = re.sub("[a-zA-Z]+", "", seg)
|
||||
seg_cut = psg.lcut(seg)
|
||||
initials = []
|
||||
finals = []
|
||||
seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
|
||||
for word, pos in seg_cut:
|
||||
if pos == 'eng':
|
||||
if pos == "eng":
|
||||
continue
|
||||
sub_initials, sub_finals = _get_initials_finals(word)
|
||||
sub_finals = tone_modifier.modified_tone(word, pos,
|
||||
sub_finals)
|
||||
sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
|
||||
initials.append(sub_initials)
|
||||
finals.append(sub_finals)
|
||||
|
||||
@@ -91,7 +97,7 @@ def _g2p(segments):
|
||||
finals = sum(finals, [])
|
||||
#
|
||||
for c, v in zip(initials, finals):
|
||||
raw_pinyin = c+v
|
||||
raw_pinyin = c + v
|
||||
# NOTE: post process for pypinyin outputs
|
||||
# we discriminate i, ii and iii
|
||||
if c == v:
|
||||
@@ -102,40 +108,40 @@ def _g2p(segments):
|
||||
v_without_tone = v[:-1]
|
||||
tone = v[-1]
|
||||
|
||||
pinyin = c+v_without_tone
|
||||
assert tone in '12345'
|
||||
pinyin = c + v_without_tone
|
||||
assert tone in "12345"
|
||||
|
||||
if c:
|
||||
# 多音节
|
||||
v_rep_map = {
|
||||
"uei": 'ui',
|
||||
'iou': 'iu',
|
||||
'uen': 'un',
|
||||
"uei": "ui",
|
||||
"iou": "iu",
|
||||
"uen": "un",
|
||||
}
|
||||
if v_without_tone in v_rep_map.keys():
|
||||
pinyin = c+v_rep_map[v_without_tone]
|
||||
pinyin = c + v_rep_map[v_without_tone]
|
||||
else:
|
||||
# 单音节
|
||||
pinyin_rep_map = {
|
||||
'ing': 'ying',
|
||||
'i': 'yi',
|
||||
'in': 'yin',
|
||||
'u': 'wu',
|
||||
"ing": "ying",
|
||||
"i": "yi",
|
||||
"in": "yin",
|
||||
"u": "wu",
|
||||
}
|
||||
if pinyin in pinyin_rep_map.keys():
|
||||
pinyin = pinyin_rep_map[pinyin]
|
||||
else:
|
||||
single_rep_map = {
|
||||
'v': 'yu',
|
||||
'e': 'e',
|
||||
'i': 'y',
|
||||
'u': 'w',
|
||||
"v": "yu",
|
||||
"e": "e",
|
||||
"i": "y",
|
||||
"u": "w",
|
||||
}
|
||||
if pinyin[0] in single_rep_map.keys():
|
||||
pinyin = single_rep_map[pinyin[0]]+pinyin[1:]
|
||||
pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
|
||||
|
||||
assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
|
||||
new_c, new_v = pinyin_to_symbol_map[pinyin].split(' ')
|
||||
new_c, new_v = pinyin_to_symbol_map[pinyin].split(" ")
|
||||
new_v = new_v + tone
|
||||
phone = [new_c, new_v]
|
||||
word2ph.append(len(phone))
|
||||
@@ -144,9 +150,8 @@ def _g2p(segments):
|
||||
return phones_list, word2ph
|
||||
|
||||
|
||||
|
||||
def text_normalize(text):
|
||||
numbers = re.findall(r'\d+(?:\.?\d+)?', text)
|
||||
numbers = re.findall(r"\d+(?:\.?\d+)?", text)
|
||||
for number in numbers:
|
||||
text = text.replace(number, cn2an.an2cn(number), 1)
|
||||
text = replace_punctuation(text)
|
||||
@@ -154,7 +159,7 @@ def text_normalize(text):
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
text = "啊——但是《原神》是由,米哈\游自主,研发的一款全.新开放世界.冒险游戏"
|
||||
text = "呣呣呣~就是…大人的鼹鼠党吧?"
|
||||
text = "你好"
|
||||
|
||||
Reference in New Issue
Block a user