关于标点符号导致参考泄漏的问题 (#1169)

* punctuation

* update

* update
This commit is contained in:
XXXXRT666
2024-06-10 09:18:35 +01:00
committed by GitHub
parent a3da8e87b5
commit 277b258360
2 changed files with 37 additions and 6 deletions

View File

@@ -8,6 +8,7 @@ sys.path.append(now_dir)
import re
import torch
import LangSegment
from typing import Dict, List, Tuple
from text.cleaner import clean_text
from text import cleaned_text_to_sequence
@@ -17,6 +18,7 @@ from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_
from tools.i18n.i18n import I18nAuto
i18n = I18nAuto()
# Characters treated as punctuation when collapsing repeated marks.
# NOTE(review): the third entry was an empty string, which matches nothing in a
# character class; it is assumed to be a horizontal ellipsis lost in transcoding
# and is restored here -- confirm against the upstream file.
punctuation = {"!", "?", "…", ",", ".", "-", " "}
def get_first(text:str) -> str:
pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@@ -54,6 +56,7 @@ class TextPreprocessor:
def preprocess(self, text:str, lang:str, text_split_method:str)->List[Dict]:
print(i18n("############ 切分文本 ############"))
texts = self.replace_consecutive_punctuation(texts)
texts = self.pre_seg_text(text, lang, text_split_method)
result = []
print(i18n("############ 提取文本Bert特征 ############"))
@@ -83,6 +86,7 @@ class TextPreprocessor:
text = text.replace("\n\n", "\n")
_texts = text.split("\n")
_texts = self.process_text(_texts)
_texts = merge_short_text_in_array(_texts, 5)
texts = []
@@ -205,6 +209,23 @@ class TextPreprocessor:
return feature
def process_text(self,texts):
    """Drop blank entries from a list of text segments.

    Args:
        texts: Iterable of segments; an entry may be ``None``.

    Returns:
        A new list holding the non-blank segments in their original order.

    Raises:
        ValueError: If no segment remains, i.e. every entry is blank or
            ``texts`` is empty.
    """
    # One definition of "blank" shared by the filter and the validity check.
    # Previously the check counted a lone "\n" as blank but the filter kept it.
    blank = (None, " ", "\n", "")
    kept = [text for text in texts if text not in blank]
    if not kept:
        raise ValueError(i18n("请输入有效文本"))
    return kept
def replace_consecutive_punctuation(self,text):
    """Collapse every run of two or more punctuation characters.

    Each run made up of characters from the module-level ``punctuation``
    collection is replaced by the first character of that run.

    Args:
        text: The string to clean.

    Returns:
        The string with each such run reduced to its first character.
    """
    # Character class matching any single punctuation mark (each one escaped).
    char_class = "[" + "".join(map(re.escape, punctuation)) + "]"
    # Group 1 captures the first mark; the repeated group consumes the rest of the run.
    run_of_marks = re.compile(f"({char_class})({char_class})+")
    return run_of_marks.sub(r"\1", text)