Better normlization (#1351)

2024-07-27 16:03:43 +08:00
parent f042030cca
commit e851ae34c9
6 changed files with 93 additions and 18 deletions
--- a/GPT_SoVITS/text/chinese.py
+++ b/GPT_SoVITS/text/chinese.py
@@ -48,12 +48,19 @@ def replace_punctuation(text):
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
-        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
+        r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
    )

    return replaced_text


+def replace_consecutive_punctuation(text):
+    punctuations = ''.join(re.escape(p) for p in punctuation)
+    pattern = f'([{punctuations}])([{punctuations}])+'
+    result = re.sub(pattern, r'\1', text)
+    return result
+
+
 def g2p(text):
    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
@@ -158,6 +165,9 @@ def text_normalize(text):
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)
+
+    # 避免重复标点引起的参考泄露
+    dest_text = replace_consecutive_punctuation(dest_text)
    return dest_text