more code refactor

This commit is contained in:
Blaise
2024-01-16 17:14:18 +01:00
parent 0d92575115
commit 0d3d47f3c3
44 changed files with 4516 additions and 2623 deletions

View File

@@ -8,57 +8,63 @@ from text import symbols
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
('', 'パーセント')
]]
_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("", "パーセント")]]
# List of (consonant, sokuon) pairs:
_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'Q([↑↓]*[kg])', r'k#\1'),
(r'Q([↑↓]*[tdjʧ])', r't#\1'),
(r'Q([↑↓]*[])', r's\1'),
(r'Q([↑↓]*[pb])', r'p#\1')
]]
_real_sokuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"Q([↑↓]*[kg])", r"k#\1"),
(r"Q([↑↓]*[tdjʧ])", r"t#\1"),
(r"Q([↑↓]*[sʃ])", r"s\1"),
(r"Q([↑↓]*[pb])", r"p#\1"),
]
]
# List of (consonant, hatsuon) pairs:
_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'N([↑↓]*[pbm])', r'm\1'),
(r'N([↑↓]*[ʧʥj])', r'n^\1'),
(r'N([↑↓]*[tdn])', r'n\1'),
(r'N([↑↓]*[kg])', r'ŋ\1')
]]
_real_hatsuon = [
(re.compile("%s" % x[0]), x[1])
for x in [
(r"N([↑↓]*[pbm])", r"m\1"),
(r"N([↑↓]*[ʧʥj])", r"n^\1"),
(r"N([↑↓]*[tdn])", r"n\1"),
(r"N([↑↓]*[kg])", r"ŋ\1"),
]
]
def post_replace_ph(ph):
    """Map one token to a member of ``symbols``, or to ``"UNK"``.

    The token is first looked up in a small punctuation replacement table;
    the (possibly replaced) token is returned if it is in ``symbols`` and
    ``"UNK"`` is returned otherwise.

    Args:
        ph: A single token, e.g. one element of the list built by
            ``preprocess_jap``.

    Returns:
        The replaced token when it is in ``symbols``, else ``"UNK"``.
    """
    # NOTE(review): in the reviewed text every non-ASCII key below (and the
    # value for "...") was blank, which collapses them into duplicate ""
    # keys. They are reconstructed here from their ASCII replacement values
    # and written as escapes so they cannot be lost again -- confirm each
    # one against the repository.
    rep_map = {
        "\uff1a": ",",  # full-width colon
        "\uff1b": ",",  # full-width semicolon
        "\uff0c": ",",  # full-width comma
        "\u3002": ".",  # ideographic full stop
        "\uff01": "!",  # full-width exclamation mark
        "\uff1f": "?",  # full-width question mark
        "\n": ".",
        "·": ",",
        "\u3001": ",",  # ideographic comma
        "...": "\u2026",  # horizontal ellipsis
    }
    ph = rep_map.get(ph, ph)
    # Anything the symbol table does not know is collapsed to the UNK token.
    return ph if ph in symbols else "UNK"
def symbols_to_japanese(text):
for regex, replacement in _symbols_to_japanese:
text = re.sub(regex, replacement, text)
@@ -66,7 +72,7 @@ def symbols_to_japanese(text):
def preprocess_jap(text):
'''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
text = symbols_to_japanese(text)
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
@@ -77,13 +83,15 @@ def preprocess_jap(text):
text += p.split(" ")
if i < len(marks):
text += [marks[i].replace(' ', '')]
text += [marks[i].replace(" ", "")]
return text
def text_normalize(text):
    """Return *text* unchanged.

    Placeholder for Japanese text normalisation, which is not implemented
    yet; the input is passed through as-is.

    Args:
        text: The text to normalise.

    Returns:
        The same object that was passed in.
    """
    # TODO: implement Japanese text normalisation.
    return text
def g2p(norm_text):
phones = preprocess_jap(norm_text)
phones = [post_replace_ph(i) for i in phones]
@@ -91,7 +99,7 @@ def g2p(norm_text):
return phones
if __name__ == '__main__':
if __name__ == "__main__":
for line in open("../../../Downloads/transcript_utf8.txt").readlines():
text = line.split(":")[1]
phones = g2p(text)