Better normalization (#1351)

This commit is contained in:
KamioRinn
2024-07-27 16:03:43 +08:00
committed by GitHub
parent f042030cca
commit e851ae34c9
6 changed files with 93 additions and 18 deletions

View File

@@ -4,7 +4,7 @@ import re
import wordsegment
from g2p_en import G2p
from string import punctuation
from text.symbols import punctuation
from text import symbols
@@ -110,6 +110,13 @@ def replace_phs(phs):
return phs_new
def replace_consecutive_punctuation(text):
    """Collapse each run of adjacent punctuation marks to its first mark.

    A "punctuation mark" is any character found in the module-level
    ``punctuation`` collection.  Whenever two or more such characters
    appear back to back (they need not be the same character), only the
    first one of the run is kept.

    Args:
        text: The string to clean up.

    Returns:
        A copy of ``text`` in which no two punctuation marks are adjacent.
    """
    # Escape every mark so that characters such as ``-``, ``]`` or ``^``
    # are taken literally inside the character class.
    char_class = ''.join(re.escape(p) for p in punctuation)
    # Group 1 captures the first mark of the run; the rest of the run is
    # matched (uncaptured) so the whole run is replaced by that first mark.
    pattern = f'([{char_class}])[{char_class}]+'
    return re.sub(pattern, r'\1', text)
def read_dict():
g2p_dict = {}
start_line = 49
@@ -234,6 +241,9 @@ def text_normalize(text):
text = re.sub(r"(?i)i\.e\.", "that is", text)
text = re.sub(r"(?i)e\.g\.", "for example", text)
# Avoid reference leakage caused by repeated punctuation
text = replace_consecutive_punctuation(text)
return text