Add en_normalization and fix LangSegmenter (#2062)

KamioRinn
2025-02-17 18:41:30 +08:00
committed by GitHub
parent c70daefea2
commit c17dd642c7
5 changed files with 301 additions and 41 deletions


@@ -10,7 +10,7 @@ from text.symbols2 import symbols
import unicodedata
from builtins import str as unicode
from g2p_en.expand import normalize_numbers
from text.en_normalization.expend import normalize
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize
from nltk import pos_tag
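This hunk swaps g2p_en's number-only expansion for the new text.en_normalization.expend.normalize entry point (applied in the third hunk below). For context, a minimal sketch of the helper being swapped out — normalize_numbers is g2p_en's real helper and expands only numeric patterns (plain numbers, ordinals, decimals, currency), passing everything else through:

from g2p_en.expand import normalize_numbers

# g2p_en's helper expands numeric patterns only; other text is untouched.
print(normalize_numbers("I bought 3 apples"))  # -> 'I bought three apples'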
@@ -22,6 +22,17 @@ CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
# Map Chinese punctuation to g2p_en-compatible equivalents
rep_map = {
    "[;:：，；]": ",",
    '["’]': "'",
    "。": ".",
    "！": "!",
    "？": "?",
}
arpa = {
    "AH0",
    "S",
@@ -220,32 +231,16 @@ def get_namedict():
def text_normalize(text):
    # todo: eng text normalize
    # Map Chinese punctuation to g2p_en-compatible equivalents
    rep_map = {
        "[;:：，；]": ",",
        '["’]': "'",
        "。": ".",
        "！": "!",
        "？": "?",
    }
    for p, r in rep_map.items():
        text = re.sub(p, r, text)
    # Text formatting adapted from g2p_en
    # Added uppercase compatibility
    # Added splitting of all-uppercase words
    # Same effect, kept consistent with chinese.py
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    text = pattern.sub(lambda x: rep_map[x.group()], text)
    text = unicode(text)
    text = normalize_numbers(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
    text = re.sub(r"(?i)i\.e\.", "that is", text)
    text = re.sub(r"(?i)e\.g\.", "for example", text)
    text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
    text = normalize(text)
    # Avoid reference-audio leakage caused by consecutive punctuation
    text = replace_consecutive_punctuation(text)
    return text
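replace_consecutive_punctuation is defined elsewhere in this file; the comment above explains why it runs last: repeated marks ("!!", "...") can leak reference-audio artifacts into synthesis. A hedged sketch of the idea (not the repo's exact implementation), collapsing any punctuation run to its first mark:

import re

def collapse_punctuation(text):  # hypothetical stand-in, for illustration only
    # Reduce any run of two or more punctuation marks to the first one.
    return re.sub(r"([.,!?'\-])([.,!?'\-])+", r"\1", text)

print(collapse_punctuation("Wait... really?! Yes!!"))  # -> 'Wait. really? Yes!'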