Add files via upload

2024-01-16 17:38:48 +08:00
parent 143d32f621
commit 41ca6028d6
65 changed files with 139856 additions and 0 deletions
--- a/GPT_SoVITS/text/chinese.py
+++ b/GPT_SoVITS/text/chinese.py
@@ -0,0 +1,167 @@
+import os
+import pdb
+import re
+
+import cn2an
+from pypinyin import lazy_pinyin, Style
+
+import sys
+sys.path.append("/data/docker/liujing04/gpt-vits/gpt-vits-master")
+
+from text.symbols import punctuation
+from text.tone_sandhi import ToneSandhi
+
+current_file_path = os.path.dirname(__file__)
+pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in
+                        open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()}
+
+import jieba.posseg as psg
+
+
+rep_map = {
+    '：': ',',
+    '；': ',',
+    '，': ',',
+    '。': '.',
+    '！': '!',
+    '？': '?',
+    '\n': '.',
+    "·": ",",
+    '、': ",",
+    '...': '…',
+    '$': '.',
+    '/': ',',
+    '—': "-"
+}
+
+tone_modifier = ToneSandhi()
+
+def replace_punctuation(text):
+    text = text.replace("嗯", "恩").replace("呣","母")
+    pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys()))
+
+    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
+
+    replaced_text = re.sub(r'[^\u4e00-\u9fa5'+"".join(punctuation)+r']+', '', replaced_text)
+
+    return replaced_text
+
+def g2p(text):
+    pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation))
+    sentences = [i for i in re.split(pattern, text) if i.strip()!='']
+    phones, word2ph = _g2p(sentences)
+    return phones, word2ph
+
+
+def _get_initials_finals(word):
+    initials = []
+    finals = []
+    orig_initials = lazy_pinyin(
+        word, neutral_tone_with_five=True, style=Style.INITIALS)
+    orig_finals = lazy_pinyin(
+        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
+    for c, v in zip(orig_initials, orig_finals):
+        initials.append(c)
+        finals.append(v)
+    return initials, finals
+
+
+def _g2p(segments):
+    phones_list = []
+    word2ph = []
+    for seg in segments:
+        pinyins = []
+        # Replace all English words in the sentence
+        seg = re.sub('[a-zA-Z]+', '', seg)
+        seg_cut = psg.lcut(seg)
+        initials = []
+        finals = []
+        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
+        for word, pos in seg_cut:
+            if pos == 'eng':
+                continue
+            sub_initials, sub_finals = _get_initials_finals(word)
+            sub_finals = tone_modifier.modified_tone(word, pos,
+                                                          sub_finals)
+            initials.append(sub_initials)
+            finals.append(sub_finals)
+
+            # assert len(sub_initials) == len(sub_finals) == len(word)
+        initials = sum(initials, [])
+        finals = sum(finals, [])
+        #
+        for c, v in zip(initials, finals):
+            raw_pinyin = c+v
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            if c == v:
+                assert c in punctuation
+                phone = [c]
+                word2ph.append(1)
+            else:
+                v_without_tone = v[:-1]
+                tone = v[-1]
+
+                pinyin = c+v_without_tone
+                assert tone in '12345'
+
+                if c:
+                    # 多音节
+                    v_rep_map = {
+                        "uei": 'ui',
+                        'iou': 'iu',
+                        'uen': 'un',
+                    }
+                    if v_without_tone in v_rep_map.keys():
+                        pinyin = c+v_rep_map[v_without_tone]
+                else:
+                    # 单音节
+                    pinyin_rep_map = {
+                        'ing': 'ying',
+                        'i': 'yi',
+                        'in': 'yin',
+                        'u': 'wu',
+                    }
+                    if pinyin in pinyin_rep_map.keys():
+                        pinyin = pinyin_rep_map[pinyin]
+                    else:
+                        single_rep_map = {
+                            'v': 'yu',
+                            'e': 'e',
+                            'i': 'y',
+                            'u': 'w',
+                        }
+                        if pinyin[0] in single_rep_map.keys():
+                            pinyin = single_rep_map[pinyin[0]]+pinyin[1:]
+
+                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
+                new_c, new_v = pinyin_to_symbol_map[pinyin].split(' ')
+                new_v = new_v + tone
+                phone = [new_c, new_v]
+                word2ph.append(len(phone))
+
+            phones_list += phone
+    return phones_list, word2ph
+
+
+
+def text_normalize(text):
+    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
+    for number in numbers:
+        text = text.replace(number, cn2an.an2cn(number), 1)
+    text = replace_punctuation(text)
+
+    return text
+
+
+if __name__ == '__main__':
+    text = "啊——但是《原神》是由,米哈\游自主，研发的一款全.新开放世界.冒险游戏"
+    text = "呣呣呣～就是…大人的鼹鼠党吧？"
+    text = "你好"
+    text = text_normalize(text)
+    print(g2p(text))
+
+
+# # 示例用法
+# text = "这是一个示例文本：,你好！这是一个测试..."
+# print(g2p_paddle(text))  # 输出: 这是一个示例文本你好这是一个测试