All in one! 合并main分支和fast_inference_分支 (#1490)

* 合并main分支和fast_inference_分支 * 修复一些bug
2024-08-20 22:19:04 +08:00
parent 9c75f35ece
commit 52c50c6c81
12 changed files with 2653 additions and 76 deletions
--- a/GPT_SoVITS/TTS_infer_pack/TTS.py
+++ b/GPT_SoVITS/TTS_infer_pack/TTS.py
--- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
+++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py
@@ -0,0 +1,244 @@
+
+import os, sys
+
+from tqdm import tqdm
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+import re
+import torch
+import LangSegment
+from text import chinese
+from typing import Dict, List, Tuple
+from text.cleaner import clean_text
+from text import cleaned_text_to_sequence
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+from TTS_infer_pack.text_segmentation_method import split_big_text, splits, get_method as get_seg_method
+
+from tools.i18n.i18n import I18nAuto, scan_language_list
+
+# UI language selection: start from the "language" environment variable ("Auto" when unset),
+# then let the last command-line argument override it when it is one of the values
+# returned by scan_language_list().
+language = os.environ.get("language", "Auto")
+if sys.argv[-1] in scan_language_list():
+    language = sys.argv[-1]
+i18n = I18nAuto(language=language)
+# Characters whose consecutive runs are collapsed by TextPreprocessor.replace_consecutive_punctuation.
+punctuation = {'!', '?', '…', ',', '.', '-', " "}
+
+def get_first(text:str) -> str:
+    """Return the part of ``text`` that precedes its first separator from ``splits``, stripped of whitespace."""
+    separator_class = "".join(map(re.escape, splits))
+    head = re.split("[" + separator_class + "]", text)[0]
+    return head.strip()
+
+def merge_short_text_in_array(texts:List[str], threshold:int) -> list:
+    """Concatenate neighbouring strings until each merged piece has at least ``threshold`` characters.
+
+    Args:
+        texts: the pieces to merge, in order.
+        threshold: minimum length a merged piece must reach before it is emitted.
+
+    Returns:
+        ``texts`` itself, untouched, when it holds fewer than two items; otherwise a new
+        list of merged pieces. A trailing remainder shorter than ``threshold`` is glued
+        onto the last emitted piece (or becomes the only piece when nothing was emitted).
+    """
+    if len(texts) < 2:
+        return texts
+    result = []
+    pending = ""
+    for ele in texts:
+        pending += ele
+        if len(pending) >= threshold:
+            result.append(pending)
+            pending = ""
+    if pending:
+        if result:
+            # Too short to stand alone: attach the leftover to the previous piece.
+            result[-1] += pending
+        else:
+            result.append(pending)
+    return result
+
+
+
+
+
+class TextPreprocessor:
+    """Splits input text into sentences and converts each one to phoneme ids plus BERT features."""
+
+    def __init__(self, bert_model:AutoModelForMaskedLM,
+                 tokenizer:AutoTokenizer, device:torch.device):
+        """Store the BERT model, its tokenizer and the device that feature tensors are moved to."""
+        self.bert_model = bert_model
+        self.tokenizer = tokenizer
+        self.device = device
+
+    def preprocess(self, text:str, lang:str, text_split_method:str, version:str="v1")->List[Dict]:
+        """Segment ``text`` and extract features for every resulting sentence.
+
+        Args:
+            text: raw input text.
+            lang: language code forwarded to segmentation and feature extraction.
+            text_split_method: name of a registered segmentation method (looked up via ``get_seg_method``).
+            version: forwarded unchanged to the text cleaner.
+
+        Returns:
+            One dict per kept sentence with the keys ``"phones"``, ``"bert_features"`` and
+            ``"norm_text"``. Sentences that yield no phones or an empty normalised text are dropped.
+        """
+        print(i18n("############ 切分文本 ############"))
+        # Collapse runs of punctuation first so the splitter sees a single mark per run.
+        text = self.replace_consecutive_punctuation(text)
+        texts = self.pre_seg_text(text, lang, text_split_method)
+        result = []
+        print(i18n("############ 提取文本Bert特征 ############"))
+        for sentence in tqdm(texts):
+            phones, bert_features, norm_text = self.segment_and_extract_feature_for_text(sentence, lang, version)
+            if phones is None or norm_text=="":
+                continue
+            result.append({
+                "phones": phones,
+                "bert_features": bert_features,
+                "norm_text": norm_text,
+            })
+        return result
+
+    def pre_seg_text(self, text:str, lang:str, text_split_method:str):
+        """Cut ``text`` into a list of sentences ready for feature extraction.
+
+        Returns an empty list for empty input. Every returned sentence ends with a
+        separator from ``splits``; sentences longer than 510 characters are further
+        divided with ``split_big_text``.
+        """
+        text = text.strip("\n")
+        if len(text) == 0:
+            return []
+        # When the text opens with a very short first clause, prepend a full stop.
+        if (text[0] not in splits and len(get_first(text)) < 4):
+            text = "。" + text if lang != "en" else "." + text
+        print(i18n("实际输入的目标文本:"))
+        print(text)
+
+        seg_method = get_seg_method(text_split_method)
+        text = seg_method(text)
+
+        while "\n\n" in text:
+            text = text.replace("\n\n", "\n")
+
+        _texts = text.split("\n")
+        _texts = self.filter_text(_texts)
+        _texts = merge_short_text_in_array(_texts, 5)
+        texts = []
+
+        for seg in _texts:
+            # Blank lines in the target text would otherwise cause errors downstream.
+            if (len(seg.strip()) == 0):
+                continue
+            # Skip segments that consist of symbols only.
+            if not re.sub(r"\W+", "", seg):
+                continue
+            if (seg[-1] not in splits):
+                seg += "。" if lang != "en" else "."
+
+            # Overlong sentences make BERT fail, so break them up.
+            if (len(seg) > 510):
+                texts.extend(split_big_text(seg))
+            else:
+                texts.append(seg)
+
+        print(i18n("实际输入的目标文本(切句后):"))
+        print(texts)
+        return texts
+
+    def segment_and_extract_feature_for_text(self, text:str, language:str, version:str="v1")->Tuple[list, torch.Tensor, str]:
+        """Thin wrapper around ``get_phones_and_bert``."""
+        return self.get_phones_and_bert(text, language, version)
+
+    def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False):
+        """Convert one sentence to ``(phones, bert, norm_text)``.
+
+        Args:
+            text: the sentence to convert.
+            language: ``"en"`` or an ``"all_*"`` code for single-language text, or one of
+                ``"zh"``, ``"ja"``, ``"ko"``, ``"yue"``, ``"auto"``, ``"auto_yue"`` for text that
+                is first split into per-language runs with ``LangSegment``.
+            version: forwarded unchanged to the text cleaner.
+            final: set on the internal retry so that the retry happens at most once.
+
+        Returns:
+            The phoneme id list, the feature tensor (real BERT features for Chinese,
+            zeros of shape ``(1024, len(phones))`` otherwise) and the normalised text.
+
+        Raises:
+            ValueError: if ``language`` is not one of the supported codes.
+        """
+        if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
+            language = language.replace("all_","")
+            if language == "en":
+                LangSegment.setfilters(["en"])
+                formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
+            else:
+                # Chinese, Japanese and Korean Han characters cannot be told apart,
+                # so trust the language the user selected.
+                formattext = text
+            while "  " in formattext:
+                formattext = formattext.replace("  ", " ")
+            if language == "zh":
+                if re.search(r'[A-Za-z]', formattext):
+                    # Latin letters inside Chinese text: upper-case them, normalise,
+                    # and reprocess through the mixed-language path.
+                    formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
+                    formattext = chinese.mix_text_normalize(formattext)
+                    return self.get_phones_and_bert(formattext,"zh",version)
+                else:
+                    phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                    bert = self.get_bert_feature(norm_text, word2ph).to(self.device)
+            elif language == "yue" and re.search(r'[A-Za-z]', formattext):
+                formattext = re.sub(r'[a-z]', lambda x: x.group(0).upper(), formattext)
+                formattext = chinese.mix_text_normalize(formattext)
+                return self.get_phones_and_bert(formattext,"yue",version)
+            else:
+                phones, word2ph, norm_text = self.clean_text_inf(formattext, language, version)
+                bert = torch.zeros(
+                    (1024, len(phones)),
+                    dtype=torch.float32,
+                ).to(self.device)
+        elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
+            textlist=[]
+            langlist=[]
+            LangSegment.setfilters(["zh","ja","en","ko"])
+            for tmp in LangSegment.getTexts(text):
+                detected = tmp["lang"]
+                if language == "auto":
+                    seg_lang = detected
+                elif language == "auto_yue":
+                    # Treat detected Chinese as Cantonese; the segment dict itself is left untouched.
+                    seg_lang = "yue" if detected == "zh" else detected
+                elif detected == "en":
+                    seg_lang = detected
+                else:
+                    # Chinese, Japanese and Korean Han characters cannot be told apart,
+                    # so trust the language the user selected.
+                    seg_lang = language
+                langlist.append(seg_lang)
+                textlist.append(tmp["text"])
+            phones_list = []
+            bert_list = []
+            norm_text_list = []
+            for seg_text, seg_lang in zip(textlist, langlist):
+                phones, word2ph, norm_text = self.clean_text_inf(seg_text, seg_lang, version)
+                bert = self.get_bert_inf(phones, word2ph, norm_text, seg_lang)
+                phones_list.append(phones)
+                norm_text_list.append(norm_text)
+                bert_list.append(bert)
+            bert = torch.cat(bert_list, dim=1)
+            phones = [phone for seg_phones in phones_list for phone in seg_phones]
+            norm_text = ''.join(norm_text_list)
+        else:
+            raise ValueError(f"Unsupported language: {language}")
+
+        if not final and len(phones) < 6:
+            # Very short result: retry once with a leading "." prepended to the text.
+            # NOTE(review): for "all_*" inputs ``language`` has already lost its prefix here,
+            # so the retry goes through the mixed-language branch - confirm this is intended.
+            return self.get_phones_and_bert("." + text,language,version,final=True)
+
+        return phones, bert, norm_text
+
+    def get_bert_feature(self, text:str, word2ph:list)->torch.Tensor:
+        """Return phone-level BERT features for ``text``.
+
+        The third-from-last hidden state is taken, its first and last rows are dropped
+        (presumably the tokenizer's special tokens - TODO confirm), and row ``i`` is
+        repeated ``word2ph[i]`` times. The result is transposed before it is returned.
+        ``word2ph`` must have exactly one entry per character of ``text``.
+        """
+        with torch.no_grad():
+            inputs = self.tokenizer(text, return_tensors="pt")
+            for i in inputs:
+                inputs[i] = inputs[i].to(self.device)
+            res = self.bert_model(**inputs, output_hidden_states=True)
+            res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()[1:-1]
+        assert len(word2ph) == len(text)
+        phone_level_feature = [
+            res[i].repeat(repeats, 1) for i, repeats in enumerate(word2ph)
+        ]
+        phone_level_feature = torch.cat(phone_level_feature, dim=0)
+        return phone_level_feature.T
+
+    def clean_text_inf(self, text:str, language:str, version:str="v1"):
+        """Clean ``text`` and return ``(phone ids, word2ph, normalised text)``."""
+        phones, word2ph, norm_text = clean_text(text, language, version)
+        phones = cleaned_text_to_sequence(phones, version)
+        return phones, word2ph, norm_text
+
+    def get_bert_inf(self, phones:list, word2ph:list, norm_text:str, language:str):
+        """Return BERT features for Chinese text, or zeros of shape ``(1024, len(phones))`` for any other language."""
+        language=language.replace("all_","")
+        if language == "zh":
+            feature = self.get_bert_feature(norm_text, word2ph).to(self.device)
+        else:
+            feature = torch.zeros(
+                (1024, len(phones)),
+                dtype=torch.float32,
+            ).to(self.device)
+
+        return feature
+
+    def filter_text(self,texts):
+        """Drop ``None``, ``" "`` and ``""`` entries from ``texts``.
+
+        Raises:
+            ValueError: if every entry is ``None``, a single space, a newline or empty.
+        """
+        if all(text in [None, " ", "\n",""] for text in texts):
+            raise ValueError(i18n("请输入有效文本"))
+        return [text for text in texts if text not in [None, " ", ""]]
+
+    def replace_consecutive_punctuation(self,text):
+        """Collapse every run of two or more ``punctuation`` characters down to the first character of the run."""
+        punctuations = ''.join(re.escape(p) for p in punctuation)
+        pattern = f'([{punctuations}])([{punctuations}])+'
+        result = re.sub(pattern, r'\1', text)
+        return result
+
+
+
--- a/GPT_SoVITS/TTS_infer_pack/init.py
+++ b/GPT_SoVITS/TTS_infer_pack/init.py
@@ -0,0 +1 @@
+from . import TTS, text_segmentation_method
--- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
+++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py
@@ -0,0 +1,173 @@
+
+
+
+
+import re
+from typing import Callable
+
+# Characters treated as "punctuation only" content by the cut* methods below.
+punctuation = {'!', '?', '…', ',', '.', '-', " "}
+# Registry of segmentation methods, filled in by the @register_method decorator.
+METHODS = {}
+
+def get_method(name:str)->Callable:
+    """Look up the segmentation method registered under ``name``.
+
+    Raises:
+        ValueError: if nothing is registered under ``name``.
+    """
+    registered = METHODS.get(name)
+    if registered is not None:
+        return registered
+    raise ValueError(f"Method {name} not found")
+
+def get_method_names()->list:
+    """Return the names of all registered segmentation methods, in registration order."""
+    return [method_name for method_name in METHODS]
+
+def register_method(name):
+    """Decorator factory: store the decorated function in ``METHODS`` under ``name`` and return it unchanged."""
+    def _store(fn):
+        METHODS[name] = fn
+        return fn
+    return _store
+
+splits = {"，", "。", "？", "！", ",", ".", "?", "!", "~", ":", "：", "—", "…", }
+
+def split_big_text(text, max_len=510):
+    """Break ``text`` into chunks of at most ``max_len`` characters, cutting only at separators.
+
+    The text is split at every character in ``splits`` (separators are kept), and the
+    pieces are greedily packed into chunks. A single piece that contains no separator
+    and is itself longer than ``max_len`` is emitted as one oversized chunk rather than
+    being cut mid-clause.
+
+    Args:
+        text: the text to split.
+        max_len: maximum chunk length in characters.
+
+    Returns:
+        The list of non-empty chunks; joining them reproduces ``text``.
+    """
+    # Escape each separator so the character class stays valid whatever ``splits`` contains.
+    separator_class = "".join(re.escape(sep) for sep in splits)
+
+    # The capturing group makes re.split keep the separators as their own pieces.
+    segments = re.split('([' + separator_class + '])', text)
+
+    result = []
+    current_segment = ''
+
+    for segment in segments:
+        if len(current_segment) + len(segment) > max_len:
+            # Flush the chunk built so far, but never emit an empty one
+            # (happens when the very first piece is already too long).
+            if current_segment:
+                result.append(current_segment)
+            current_segment = segment
+        else:
+            current_segment += segment
+
+    # Emit whatever is left over as the last chunk.
+    if current_segment:
+        result.append(current_segment)
+
+    return result
+
+
+
+def split(todo_text):
+    """Split ``todo_text`` into clauses, each ending with its separator from ``splits``.
+
+    ``"……"`` is first replaced by ``"。"`` and ``"——"`` by ``"，"``. If the text does not
+    end with a separator, ``"。"`` is appended so the final clause is terminated too.
+
+    Returns:
+        The list of clauses in order, or an empty list for empty input.
+    """
+    todo_text = todo_text.replace("……", "。").replace("——", "，")
+    if not todo_text:
+        return []
+    if todo_text[-1] not in splits:
+        todo_text += "。"
+    todo_texts = []
+    clause_start = 0
+    for idx, char in enumerate(todo_text):
+        if char in splits:
+            # Close the current clause right after its separator.
+            todo_texts.append(todo_text[clause_start:idx + 1])
+            clause_start = idx + 1
+    # The text is guaranteed to end with a separator, so nothing is left over here.
+    return todo_texts
+
+
+# No splitting
+@register_method("cut0")
+def cut0(inp):
+    """Return ``inp`` unchanged, or a bare newline when it holds nothing but ``punctuation`` characters.
+
+    The newline makes the caller's line split produce only empty lines, so
+    punctuation-only input is treated as "no text".
+    """
+    if not set(inp).issubset(punctuation):
+        return inp
+    else:
+        return "\n"
+
+
+# Split into groups of four clauses
+@register_method("cut1")
+def cut1(inp):
+    """Group the clauses of ``inp`` four at a time and join the groups with newlines.
+
+    Group boundaries are taken every four clauses, except that the last boundary is
+    replaced by "end of text", so the final group absorbs everything after the
+    second-to-last boundary. Groups made up only of ``punctuation`` characters are
+    dropped. Empty input yields an empty string.
+    """
+    inp = inp.strip("\n")
+    if not inp:
+        return ""
+    inps = split(inp)
+    split_idx = list(range(0, len(inps), 4))
+    split_idx[-1] = None
+    if len(split_idx) > 1:
+        opts = [
+            "".join(inps[start:end])
+            for start, end in zip(split_idx, split_idx[1:])
+        ]
+    else:
+        opts = [inp]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)
+
+
+# Split into chunks of roughly 50 characters
+@register_method("cut2")
+def cut2(inp):
+    """Pack the clauses of ``inp`` into chunks of just over 50 characters, joined with newlines.
+
+    Clauses are accumulated until the chunk exceeds 50 characters. A final chunk
+    shorter than 50 characters is merged into the one before it. Chunks made up only
+    of ``punctuation`` characters are dropped. Input with fewer than two clauses is
+    returned as is; empty input yields an empty string.
+    """
+    inp = inp.strip("\n")
+    if not inp:
+        return ""
+    inps = split(inp)
+    if len(inps) < 2:
+        return inp
+    opts = []
+    tmp_str = ""
+    for clause in inps:
+        tmp_str += clause
+        if len(tmp_str) > 50:
+            opts.append(tmp_str)
+            tmp_str = ""
+    if tmp_str != "":
+        opts.append(tmp_str)
+    if len(opts) > 1 and len(opts[-1]) < 50:
+        # The last chunk is too short: merge it into the previous one.
+        opts[-2] = opts[-2] + opts[-1]
+        opts = opts[:-1]
+    opts = [item for item in opts if not set(item).issubset(punctuation)]
+    return "\n".join(opts)
+
+# Split on the Chinese full stop "。"
+@register_method("cut3")
+def cut3(inp):
+    """Split ``inp`` at every "。" and join the sentences with newlines, dropping ``punctuation``-only ones."""
+    trimmed = inp.strip("\n").strip("。")
+    kept = []
+    for sentence in trimmed.split("。"):
+        if set(sentence).issubset(punctuation):
+            continue
+        kept.append(sentence)
+    return "\n".join(kept)
+
+# Split on the English full stop "."
+@register_method("cut4")
+def cut4(inp):
+    """Split ``inp`` at every "." and join the sentences with newlines, dropping ``punctuation``-only ones."""
+    trimmed = inp.strip("\n").strip(".")
+    kept = []
+    for sentence in trimmed.split("."):
+        if set(sentence).issubset(punctuation):
+            continue
+        kept.append(sentence)
+    return "\n".join(kept)
+
+# Split on punctuation marks
+# contributed by https://github.com/AI-Hobbyist/GPT-SoVITS/blob/main/GPT_SoVITS/inference_webui.py
+@register_method("cut5")
+def cut5(inp):
+    """Split ``inp`` after every punctuation mark and join the pieces with newlines.
+
+    A "." that sits between two digits is treated as a decimal point and does not end
+    a piece. Pieces consisting solely of punctuation marks are dropped.
+    """
+    inp = inp.strip("\n")
+    # Half-width marks followed by their full-width counterparts.
+    punds = {',', '.', ';', '?', '!', '、', '，', '。', '？', '！', '；', '：', '…'}
+    mergeitems = []
+    items = []
+
+    last_index = len(inp) - 1
+    for i, char in enumerate(inp):
+        items.append(char)
+        if char not in punds:
+            continue
+        # Keep decimal numbers such as "3.14" in one piece.
+        if char == '.' and 0 < i < last_index and inp[i - 1].isdigit() and inp[i + 1].isdigit():
+            continue
+        mergeitems.append("".join(items))
+        items = []
+
+    if items:
+        mergeitems.append("".join(items))
+
+    opt = [item for item in mergeitems if not set(item).issubset(punds)]
+    return "\n".join(opt)
+
+
+
+if __name__ == '__main__':
+    # Manual smoke test: run the punctuation-based splitter on a sample sentence.
+    demo_cut = get_method("cut5")
+    demo_output = demo_cut("你好，我是小明。你好，我是小红。你好，我是小刚。你好，我是小张。")
+    print(demo_output)
+
				`@@ -0,0 +1 @@`
				`from . import TTS, text_segmentation_method`