Better normalization (#1351)
This commit is contained in:
@@ -4,7 +4,7 @@ import re
|
||||
import wordsegment
|
||||
from g2p_en import G2p
|
||||
|
||||
from string import punctuation
|
||||
from text.symbols import punctuation
|
||||
|
||||
from text import symbols
|
||||
|
||||
@@ -110,6 +110,13 @@ def replace_phs(phs):
|
||||
return phs_new
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
    """Collapse each run of adjacent punctuation marks to its first mark.

    A "punctuation mark" is any character that appears in the module-level
    ``punctuation`` collection. Whenever two or more such characters occur
    back to back, only the first character of that run is kept.

    Args:
        text: The string to clean up.

    Returns:
        A copy of ``text`` in which every run of consecutive punctuation
        characters has been reduced to the run's first character.
    """
    # Escape every mark so characters such as "]" or "-" are taken literally
    # when placed inside a regex character class.
    escaped_marks = [re.escape(mark) for mark in punctuation]
    char_class = "[" + "".join(escaped_marks) + "]"

    # Group 1 captures the leading mark; the repeated second group swallows
    # every following mark in the same run.
    run_pattern = "(" + char_class + ")(" + char_class + ")+"

    return re.sub(run_pattern, lambda run: run.group(1), text)
|
||||
|
||||
|
||||
def read_dict():
|
||||
g2p_dict = {}
|
||||
start_line = 49
|
||||
@@ -234,6 +241,9 @@ def text_normalize(text):
|
||||
text = re.sub(r"(?i)i\.e\.", "that is", text)
|
||||
text = re.sub(r"(?i)e\.g\.", "for example", text)
|
||||
|
||||
# 避免重复标点引起的参考泄露
|
||||
text = replace_consecutive_punctuation(text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user