Add files via upload

2024-01-16 17:38:48 +08:00
parent 143d32f621
commit 41ca6028d6
65 changed files with 139856 additions and 0 deletions
@@ -0,0 +1,80 @@
+# modified from https://github.com/feng-yufei/shared_debugging_code/blob/main/text_processing/phonemizer.py
+import itertools
+import re
+from typing import Dict
+from typing import List
+
+import regex
+from gruut import sentences
+from gruut.const import Sentence
+from gruut.const import Word
+from AR.text_processing.symbols import SYMBOL_TO_ID
+
+
+class GruutPhonemizer:
+    def __init__(self, language: str):
+        self._phonemizer = sentences
+        self.lang = language
+        self.symbol_to_id = SYMBOL_TO_ID
+        self._special_cases_dict: Dict[str] = {
+            r"\.\.\.": "... ",
+            ";": "; ",
+            ":": ": ",
+            ",": ", ",
+            r"\.": ". ",
+            "!": "! ",
+            r"\?": "? ",
+            "—": "—",
+            "…": "… ",
+            "«": "«",
+            "»": "»"
+        }
+        self._punctuation_regexp: str = rf"([{''.join(self._special_cases_dict.keys())}])"
+
+    def _normalize_punctuation(self, text: str) -> str:
+        text = regex.sub(fr"\pZ+{self._punctuation_regexp}", r"\1", text)
+        text = regex.sub(fr"{self._punctuation_regexp}(\pL)", r"\1 \2", text)
+        text = regex.sub(r"\pZ+", r" ", text)
+        return text.strip()
+
+    def _convert_punctuation(self, word: Word) -> str:
+        if not word.phonemes:
+            return ''
+        if word.phonemes[0] in ['‖', '|']:
+            return word.text.strip()
+
+        phonemes = ''.join(word.phonemes)
+        # remove modifier characters ˈˌː with regex
+        phonemes = re.sub(r'[ˈˌː͡]', '', phonemes)
+        return phonemes.strip()
+
+    def phonemize(self, text: str, espeak: bool=False) -> str:
+        text_to_phonemize: str = self._normalize_punctuation(text)
+        sents: List[Sentence] = [
+            sent
+            for sent in self._phonemizer(
+                text_to_phonemize, lang="en-us", espeak=espeak)
+        ]
+        words: List[str] = [
+            self._convert_punctuation(word) for word in itertools.chain(*sents)
+        ]
+        return ' '.join(words)
+
+    def transform(self, phonemes):
+        # convert phonemes to ids
+        # dictionary is in symbols.py
+        return [
+            self.symbol_to_id[p] for p in phonemes
+            if p in self.symbol_to_id.keys()
+        ]
+
+
+if __name__ == "__main__":
+    phonemizer = GruutPhonemizer("en-us")
+    # text -> IPA
+    phonemes = phonemizer.phonemize("Hello, wor-ld ?")
+    print("phonemes:", phonemes)
+    print("len(phonemes):", len(phonemes))
+    phoneme_ids = phonemizer.transform(phonemes)
+    print("phoneme_ids:", phoneme_ids)
+    print("len(phoneme_ids):", len(phoneme_ids))