关于标点符号导致参考泄漏的问题 (#1169)

* punctuation * update * update
2024-06-10 09:18:35 +01:00
parent a3da8e87b5
commit 277b258360
2 changed files with 37 additions and 6 deletions
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -8,6 +8,7 @@ sys.path.append(now_dir)
 import re
 import torch
 import LangSegment
+
 from typing import Dict, List, Tuple
 from text.cleaner import clean_text
 from text import cleaned_text_to_sequence
@@ -17,6 +18,7 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_
 from tools.i18n.i18n import I18nAuto

 i18n = I18nAuto()
+# Characters that replace_consecutive_punctuation collapses when they appear in a run.
+punctuation = {'!', '?', '…', ',', '.', '-', " "}

 def get_first(text:str) -> str:
    pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@@ -54,6 +56,7 @@ class TextPreprocessor:
        
    def preprocess(self, text:str, lang:str, text_split_method:str)->List[Dict]:
        print(i18n("############ 切分文本 ############"))
+        # Collapse runs of punctuation in the raw input string before it is
+        # segmented; the result must be bound to `text` because `texts` does
+        # not exist until pre_seg_text returns.
+        text = self.replace_consecutive_punctuation(text)
        texts = self.pre_seg_text(text, lang, text_split_method)
        result = []
        print(i18n("############ 提取文本Bert特征 ############"))
@@ -83,6 +86,7 @@ class TextPreprocessor:
            text = text.replace("\n\n", "\n")

        _texts = text.split("\n")
+        _texts = self.process_text(_texts)
        _texts = merge_short_text_in_array(_texts, 5)
        texts = []

@@ -205,6 +209,23 @@ class TextPreprocessor:

        return feature
    
+    def process_text(self, texts):
+        """Validate the split segments and drop the empty ones.
+
+        Raises ValueError when every entry of ``texts`` is None, a single
+        space, a single newline or the empty string (an empty ``texts``
+        also raises). Otherwise returns a new list, in the original order,
+        without the entries equal to None, a single space or the empty
+        string. A lone newline counts as blank for the validation step but
+        is not removed by the filter.
+        """
+        blank_for_validation = (None, " ", "\n", "")
+        blank_for_filtering = (None, " ", "")
+        if all(segment in blank_for_validation for segment in texts):
+            raise ValueError(i18n("请输入有效文本"))
+        return [segment for segment in texts if segment not in blank_for_filtering]
+    
+
+    def replace_consecutive_punctuation(self, text):
+        """Collapse each run of punctuation down to its first character.
+
+        Any sequence of two or more consecutive characters drawn from the
+        module-level ``punctuation`` set is replaced by the first character
+        of that sequence; the characters in a run need not be identical.
+        Returns the resulting string.
+        """
+        # One escaped character class, used for both the kept character and the dropped tail.
+        char_class = "[" + "".join(map(re.escape, punctuation)) + "]"
+        return re.sub(f"({char_class})({char_class})+", r"\1", text)