Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the download link for the G2PW model

* Update the PyTorch version and the Colab notebook
Authored by XXXXRT666 on 2025-04-07 09:42:47 +01:00, committed by GitHub
parent 9da7e17efe
commit 53cac93589
132 changed files with 8185 additions and 6648 deletions
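For reproducibility, the same formatting settings can be pinned in the project's Ruff configuration instead of being passed on the command line each time. A minimal sketch, assuming the options live under [tool.ruff] in pyproject.toml (this commit itself only records the CLI invocations above):

[tool.ruff]
line-length = 120
target-version = "py39"

With that section in place, a plain ruff check --fix and ruff format reproduce the 120-character line length and Python 3.9 target used for this refactor.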

View File

@@ -1 +1 @@
from .langsegmenter import LangSegmenter
from .langsegmenter import LangSegmenter

View File

@@ -3,38 +3,44 @@ import re
# jieba静音
import jieba
jieba.setLogLevel(logging.CRITICAL)
# 更改fast_langdetect大模型位置
from pathlib import Path
import fast_langdetect
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
fast_langdetect.infer.LangDetectConfig(
cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
)
)
from split_lang import LangSplitter
def full_en(text):
pattern = r'^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
pattern = r"^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
return bool(re.match(pattern, text))
def full_cjk(text):
# 来自wiki
cjk_ranges = [
(0x4E00, 0x9FFF), # CJK Unified Ideographs
(0x3400, 0x4DB5), # CJK Extension A
(0x20000, 0x2A6DD), # CJK Extension B
(0x2A700, 0x2B73F), # CJK Extension C
(0x2B740, 0x2B81F), # CJK Extension D
(0x2B820, 0x2CEAF), # CJK Extension E
(0x2CEB0, 0x2EBEF), # CJK Extension F
(0x30000, 0x3134A), # CJK Extension G
(0x31350, 0x323AF), # CJK Extension H
(0x2EBF0, 0x2EE5D), # CJK Extension H
(0x4E00, 0x9FFF), # CJK Unified Ideographs
(0x3400, 0x4DB5), # CJK Extension A
(0x20000, 0x2A6DD), # CJK Extension B
(0x2A700, 0x2B73F), # CJK Extension C
(0x2B740, 0x2B81F), # CJK Extension D
(0x2B820, 0x2CEAF), # CJK Extension E
(0x2CEB0, 0x2EBEF), # CJK Extension F
(0x30000, 0x3134A), # CJK Extension G
(0x31350, 0x323AF), # CJK Extension H
(0x2EBF0, 0x2EE5D), # CJK Extension H
]
pattern = r'[0-9、-〜。!?.!?… ]+$'
pattern = r"[0-9、-〜。!?.!?… ]+$"
cjk_text = ""
for char in text:
@@ -45,7 +51,7 @@ def full_cjk(text):
return cjk_text
def split_jako(tag_lang,item):
def split_jako(tag_lang, item):
if tag_lang == "ja":
pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
else:
@@ -53,41 +59,40 @@ def split_jako(tag_lang,item):
lang_list: list[dict] = []
tag = 0
for match in re.finditer(pattern, item['text']):
for match in re.finditer(pattern, item["text"]):
if match.start() > tag:
lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})
tag = match.end()
lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})
if tag < len(item['text']):
lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
if tag < len(item["text"]):
lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})
return lang_list
def merge_lang(lang_list, item):
if lang_list and item['lang'] == lang_list[-1]['lang']:
lang_list[-1]['text'] += item['text']
if lang_list and item["lang"] == lang_list[-1]["lang"]:
lang_list[-1]["text"] += item["text"]
else:
lang_list.append(item)
return lang_list
class LangSegmenter():
class LangSegmenter:
# 默认过滤器, 基于gsv目前四种语言
DEFAULT_LANG_MAP = {
"zh": "zh",
"yue": "zh", # 粤语
"wuu": "zh", # 吴语
"zh-cn": "zh",
"zh-tw": "x", # 繁体设置为x
"zh-tw": "x", # 繁体设置为x
"ko": "ko",
"ja": "ja",
"en": "en",
}
def getTexts(text):
lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
substr = lang_splitter.split_by_lang(text=text)
@@ -95,18 +100,18 @@ class LangSegmenter():
lang_list: list[dict] = []
for _, item in enumerate(substr):
dict_item = {'lang':item.lang,'text':item.text}
dict_item = {"lang": item.lang, "text": item.text}
# 处理短英文被识别为其他语言的问题
if full_en(dict_item['text']):
dict_item['lang'] = 'en'
lang_list = merge_lang(lang_list,dict_item)
if full_en(dict_item["text"]):
dict_item["lang"] = "en"
lang_list = merge_lang(lang_list, dict_item)
continue
# 处理非日语夹日文的问题(不包含CJK)
ja_list: list[dict] = []
if dict_item['lang'] != 'ja':
ja_list = split_jako('ja',dict_item)
if dict_item["lang"] != "ja":
ja_list = split_jako("ja", dict_item)
if not ja_list:
ja_list.append(dict_item)
@@ -115,8 +120,8 @@ class LangSegmenter():
ko_list: list[dict] = []
temp_list: list[dict] = []
for _, ko_item in enumerate(ja_list):
if ko_item["lang"] != 'ko':
ko_list = split_jako('ko',ko_item)
if ko_item["lang"] != "ko":
ko_list = split_jako("ko", ko_item)
if ko_list:
temp_list.extend(ko_list)
@@ -126,28 +131,28 @@ class LangSegmenter():
# 未存在非日韩文夹日韩文
if len(temp_list) == 1:
# 未知语言检查是否为CJK
if dict_item['lang'] == 'x':
cjk_text = full_cjk(dict_item['text'])
if dict_item["lang"] == "x":
cjk_text = full_cjk(dict_item["text"])
if cjk_text:
dict_item = {'lang':'zh','text':cjk_text}
lang_list = merge_lang(lang_list,dict_item)
dict_item = {"lang": "zh", "text": cjk_text}
lang_list = merge_lang(lang_list, dict_item)
continue
else:
lang_list = merge_lang(lang_list,dict_item)
lang_list = merge_lang(lang_list, dict_item)
continue
# 存在非日韩文夹日韩文
for _, temp_item in enumerate(temp_list):
# 未知语言检查是否为CJK
if temp_item['lang'] == 'x':
cjk_text = full_cjk(dict_item['text'])
if temp_item["lang"] == "x":
cjk_text = full_cjk(dict_item["text"])
if cjk_text:
dict_item = {'lang':'zh','text':cjk_text}
lang_list = merge_lang(lang_list,dict_item)
dict_item = {"lang": "zh", "text": cjk_text}
lang_list = merge_lang(lang_list, dict_item)
else:
lang_list = merge_lang(lang_list,temp_item)
lang_list = merge_lang(lang_list, temp_item)
return lang_list
if __name__ == "__main__":
text = "MyGO?,你也喜欢まいご吗?"
@@ -155,4 +160,3 @@ if __name__ == "__main__":
text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
print(LangSegmenter.getTexts(text))
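For orientation, getTexts returns a list of {"lang": ..., "text": ...} dicts after the merge step above. A sketch of the expected shape for the first sample, with illustrative values only (the actual segmentation depends on the installed split_lang and fast_langdetect models):

LangSegmenter.getTexts("MyGO?,你也喜欢まいご吗?")
# e.g. [{"lang": "en", "text": "MyGO?,"}, {"lang": "zh", "text": "你也喜欢"}, {"lang": "ja", "text": "まいご"}, {"lang": "zh", "text": "吗?"}]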

View File

@@ -10,18 +10,19 @@ from text import symbols2 as symbols_v2
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
def cleaned_text_to_sequence(cleaned_text, version=None):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
Returns:
List of integers corresponding to the symbols in the text
'''
if version is None:version=os.environ.get('version', 'v2')
if version == "v1":
phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
else:
phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
return phones
"""
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
else:
phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
return phones

View File

@@ -1,6 +1,5 @@
# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py
import sys
import re
import cn2an
import ToJyutping
@@ -99,9 +98,7 @@ def replace_punctuation(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
@@ -115,7 +112,9 @@ def text_normalize(text):
return dest_text
punctuation_set=set(punctuation)
punctuation_set = set(punctuation)
def jyuping_to_initials_finals_tones(jyuping_syllables):
initials_finals = []
tones = []
@@ -160,12 +159,14 @@ def jyuping_to_initials_finals_tones(jyuping_syllables):
assert len(initials_finals) == len(tones)
###魔改为辅音+带音调的元音
phones=[]
for a,b in zip(initials_finals,tones):
if(b not in [-1,0]):###防止粤语和普通话重合开头加Y如果是标点不加。
todo="%s%s"%(a,b)
else:todo=a
if(todo not in punctuation_set):todo="Y%s"%todo
phones = []
for a, b in zip(initials_finals, tones):
if b not in [-1, 0]: ###防止粤语和普通话重合开头加Y如果是标点不加。
todo = "%s%s" % (a, b)
else:
todo = a
if todo not in punctuation_set:
todo = "Y%s" % todo
phones.append(todo)
# return initials_finals, tones, word2ph
@@ -218,4 +219,4 @@ if __name__ == "__main__":
# phones, tones, word2ph = g2p(text)
phones, word2ph = g2p(text)
# print(phones, tones, word2ph)
print(phones, word2ph)
print(phones, word2ph)

View File

@@ -1,5 +1,4 @@
import os
import pdb
import re
import cn2an
@@ -17,7 +16,9 @@ pinyin_to_symbol_map = {
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
import jieba_fast, logging
import jieba_fast
import logging
jieba_fast.setLogLevel(logging.CRITICAL)
import jieba_fast.posseg as psg
@@ -37,7 +38,7 @@ rep_map = {
"/": ",",
"": "-",
"~": "",
"":"",
"": "",
}
tone_modifier = ToneSandhi()
@@ -49,9 +50,7 @@ def replace_punctuation(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
@@ -62,17 +61,15 @@ def replace_punctuation_with_en(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
)
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
@@ -87,9 +84,7 @@ def _get_initials_finals(word):
initials = []
finals = []
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_finals = lazy_pinyin(
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
)
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for c, v in zip(orig_initials, orig_finals):
initials.append(c)
finals.append(v)

View File

@@ -1,10 +1,9 @@
import os
import pdb
import re
import cn2an
from pypinyin import lazy_pinyin, Style
from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals
from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials
from text.symbols import punctuation
from text.tone_sandhi import ToneSandhi
@@ -18,18 +17,26 @@ pinyin_to_symbol_map = {
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}
import jieba_fast, logging
import jieba_fast
import logging
jieba_fast.setLogLevel(logging.CRITICAL)
import jieba_fast.posseg as psg
# is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启
# is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False
is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False
if is_g2pw:
# print("当前使用g2pw进行拼音推理")
from text.g2pw import G2PWPinyin, correct_pronunciation
parent_directory = os.path.dirname(current_file_path)
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
g2pw = G2PWPinyin(
model_dir="GPT_SoVITS/text/G2PWModel",
model_source=os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),
v_to_u=False,
neutral_tone_with_five=True,
)
rep_map = {
"": ",",
@@ -46,7 +53,7 @@ rep_map = {
"/": ",",
"": "-",
"~": "",
"":"",
"": "",
}
tone_modifier = ToneSandhi()
@@ -58,9 +65,7 @@ def replace_punctuation(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
)
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
@@ -77,9 +82,7 @@ def _get_initials_finals(word):
finals = []
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
orig_finals = lazy_pinyin(
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
)
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for c, v in zip(orig_initials, orig_finals):
initials.append(c)
@@ -87,31 +90,66 @@ def _get_initials_finals(word):
return initials, finals
must_erhua = {
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"}
not_erhua = {
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫",
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "",
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "",
"", "少儿"
"虐儿",
"",
"",
"",
"",
"替儿",
"有儿",
"一儿",
"我儿",
"俺儿",
"妻儿",
"拐儿",
"聋儿",
"乞儿",
"患儿",
"幼儿",
"孤儿",
"婴儿",
"婴幼儿",
"连体儿",
"脑瘫儿",
"流浪儿",
"体弱儿",
"混血儿",
"蜜雪儿",
"舫儿",
"祖儿",
"美儿",
"应采儿",
"可儿",
"侄儿",
"孙儿",
"侄孙儿",
"女儿",
"男儿",
"红孩儿",
"花儿",
"虫儿",
"马儿",
"鸟儿",
"猪儿",
"猫儿",
"狗儿",
"少儿",
}
def _merge_erhua(initials: list[str],
finals: list[str],
word: str,
pos: str) -> list[list[str]]:
def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> list[list[str]]:
"""
Do erhub.
"""
# fix er1
for i, phn in enumerate(finals):
if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
finals[i] = 'er2'
if i == len(finals) - 1 and word[i] == "儿" and phn == "er1":
finals[i] = "er2"
# 发音
if word not in must_erhua and (word in not_erhua or
pos in {"a", "j", "nr"}):
if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
return initials, finals
# "……" 等情况直接返回
@@ -124,9 +162,13 @@ def _merge_erhua(initials: list[str],
new_initials = []
new_finals = []
for i, phn in enumerate(finals):
if i == len(finals) - 1 and word[i] == "儿" and phn in {
"er2", "er5"
} and word[-2:] not in not_erhua and new_finals:
if (
i == len(finals) - 1
and word[i] == "儿"
and phn in {"er2", "er5"}
and word[-2:] not in not_erhua
and new_finals
):
phn = "er" + new_finals[-1][-1]
new_initials.append(initials[i])
@@ -160,7 +202,7 @@ def _g2p(segments):
# assert len(sub_initials) == len(sub_finals) == len(word)
initials = sum(initials, [])
finals = sum(finals, [])
print("pypinyin结果",initials,finals)
print("pypinyin结果", initials, finals)
else:
# g2pw采用整句推理
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
@@ -171,19 +213,19 @@ def _g2p(segments):
sub_finals = []
now_word_length = pre_word_length + len(word)
if pos == 'eng':
if pos == "eng":
pre_word_length = now_word_length
continue
word_pinyins = pinyins[pre_word_length:now_word_length]
# 多音字消歧
word_pinyins = correct_pronunciation(word,word_pinyins)
word_pinyins = correct_pronunciation(word, word_pinyins)
for pinyin in word_pinyins:
if pinyin[0].isalpha():
sub_initials.append(to_initials(pinyin))
sub_finals.append(to_finals_tone3(pinyin,neutral_tone_with_five=True))
sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True))
else:
sub_initials.append(pinyin)
sub_finals.append(pinyin)
@@ -259,18 +301,18 @@ def replace_punctuation_with_en(text):
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
replaced_text = re.sub(
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
)
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
return replaced_text
def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
def text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
tx = TextNormalizer()
@@ -283,6 +325,7 @@ def text_normalize(text):
dest_text = replace_consecutive_punctuation(dest_text)
return dest_text
# 不排除英文的文本格式化
def mix_text_normalize(text):
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization

View File

@@ -19,55 +19,57 @@ special = [
def clean_text(text, language, version=None):
if version is None:version=os.environ.get('version', 'v2')
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
symbols = symbols_v1.symbols
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
else:
symbols = symbols_v2.symbols
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"}
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
if(language not in language_module_map):
language="en"
text=" "
if language not in language_module_map:
language = "en"
text = " "
for special_s, special_l, target_symbol in special:
if special_s in text and language == special_l:
return clean_special(text, language, special_s, target_symbol, version)
language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]])
if hasattr(language_module,"text_normalize"):
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
if hasattr(language_module, "text_normalize"):
norm_text = language_module.text_normalize(text)
else:
norm_text=text
if language == "zh" or language=="yue":##########
norm_text = text
if language == "zh" or language == "yue": ##########
phones, word2ph = language_module.g2p(norm_text)
assert len(phones) == sum(word2ph)
assert len(norm_text) == len(word2ph)
elif language == "en":
phones = language_module.g2p(norm_text)
if len(phones) < 4:
phones = [','] + phones
phones = [","] + phones
word2ph = None
else:
phones = language_module.g2p(norm_text)
word2ph = None
phones = ['UNK' if ph not in symbols else ph for ph in phones]
phones = ["UNK" if ph not in symbols else ph for ph in phones]
return phones, word2ph, norm_text
def clean_special(text, language, special_s, target_symbol, version=None):
if version is None:version=os.environ.get('version', 'v2')
if version is None:
version = os.environ.get("version", "v2")
if version == "v1":
symbols = symbols_v1.symbols
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
else:
symbols = symbols_v2.symbols
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"}
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
"""
特殊静音段sp符号处理
"""
text = text.replace(special_s, ",")
language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]])
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
norm_text = language_module.text_normalize(text)
phones = language_module.g2p(norm_text)
new_ph = []
@@ -81,8 +83,9 @@ def clean_special(text, language, special_s, target_symbol, version=None):
def text_to_sequence(text, language, version=None):
version = os.environ.get('version',version)
if version is None:version='v2'
version = os.environ.get("version", version)
if version is None:
version = "v2"
phones = clean_text(text)
return cleaned_text_to_sequence(phones, version)

View File

@@ -9,17 +9,17 @@ import unicodedata
# 后缀计量单位替换表
measurement_map = {
"m": ["meter", "meters"],
'km': ["kilometer", "kilometers"],
"km": ["kilometer", "kilometers"],
"km/h": ["kilometer per hour", "kilometers per hour"],
"ft": ["feet", "feet"],
"L": ["liter", "liters"],
"tbsp": ["tablespoon", "tablespoons"],
'tsp': ["teaspoon", "teaspoons"],
"tsp": ["teaspoon", "teaspoons"],
"h": ["hour", "hours"],
"min": ["minute", "minutes"],
"s": ["second", "seconds"],
"°C": ["degree celsius", "degrees celsius"],
"°F": ["degree fahrenheit", "degrees fahrenheit"]
"°F": ["degree fahrenheit", "degrees fahrenheit"],
}
@@ -27,41 +27,42 @@ measurement_map = {
_inflect = inflect.engine()
# 转化数字序数词
_ordinal_number_re = re.compile(r'\b([0-9]+)\. ')
_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")
# 我听说好像对于数字正则识别其实用 \d 会好一点
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
# 时间识别
_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b')
_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")
# 后缀计量单位识别
_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b')
_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b")
# 前后 £ 识别 ( 写了识别两边某一边的,但是不知道为什么失败了┭┮﹏┭┮ )
_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)')
_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£')
_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")
# 前后 $ 识别
_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_dollars_re_end = re.compile(r'([(0-9\.\,]*[0-9]+)\$')
_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$")
# 小数的识别
_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)')
_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")
# 分数识别 (形式 "3/4" )
_fraction_re = re.compile(r'([0-9]+/[0-9]+)')
_fraction_re = re.compile(r"([0-9]+/[0-9]+)")
# 序数词识别
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
# 数字处理
_number_re = re.compile(r'[0-9]+')
_number_re = re.compile(r"[0-9]+")
def _convert_ordinal(m):
"""
标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
Examples:
input: "1. "
output: "1st"
@@ -70,24 +71,26 @@ def _convert_ordinal(m):
ordinal = _inflect.ordinal(m.group(1))
return ordinal + ", "
def _remove_commas(m):
return m.group(1).replace(',', '')
return m.group(1).replace(",", "")
def _expand_time(m):
"""
将 24 小时制的时间转换为 12 小时制的时间表示方式。
Examples:
input: "13:00 / 4:00 / 13:30"
output: "one o'clock p.m. / four o'clock am. / one thirty p.m."
"""
hours, minutes = map(int, m.group(1, 2))
period = 'a.m.' if hours < 12 else 'p.m.'
period = "a.m." if hours < 12 else "p.m."
if hours > 12:
hours -= 12
hour_word = _inflect.number_to_words(hours)
minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ''
minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ""
if minutes == 0:
return f"{hour_word} o'clock {period}"
@@ -103,7 +106,7 @@ def _expand_measurement(m):
sign = m.group(3)
ptr = 1
# 想不到怎么方便的取数字又懒得改正则1.2 反正也是复数读法,干脆直接去掉 "."
num = int(m.group(1).replace(sign, '').replace(".",''))
num = int(m.group(1).replace(sign, "").replace(".", ""))
decimal_part = m.group(2)
# 上面判断的漏洞,比如 0.1 的情况,在这里排除了
if decimal_part == None and num == 1:
@@ -116,23 +119,24 @@ def _expand_pounds(m):
没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起
"""
match = m.group(1)
parts = match.split('.')
parts = match.split(".")
if len(parts) > 2:
return match + ' pounds' # Unexpected format
return match + " pounds" # Unexpected format
pounds = int(parts[0]) if parts[0] else 0
pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
if pounds and pence:
pound_unit = 'pound' if pounds == 1 else 'pounds'
penny_unit = 'penny' if pence == 1 else 'pence'
return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit)
pound_unit = "pound" if pounds == 1 else "pounds"
penny_unit = "penny" if pence == 1 else "pence"
return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit)
elif pounds:
pound_unit = 'pound' if pounds == 1 else 'pounds'
return '%s %s' % (pounds, pound_unit)
pound_unit = "pound" if pounds == 1 else "pounds"
return "%s %s" % (pounds, pound_unit)
elif pence:
penny_unit = 'penny' if pence == 1 else 'pence'
return '%s %s' % (pence, penny_unit)
penny_unit = "penny" if pence == 1 else "pence"
return "%s %s" % (pence, penny_unit)
else:
return 'zero pounds'
return "zero pounds"
def _expand_dollars(m):
"""
@@ -142,23 +146,24 @@ def _expand_dollars(m):
output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
"""
match = m.group(1)
parts = match.split('.')
parts = match.split(".")
if len(parts) > 2:
return match + ' dollars' # Unexpected format
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit)
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return 'zero dollars'
return "zero dollars"
# 小数的处理
def _expand_decimal_number(m):
@@ -168,11 +173,11 @@ def _expand_decimal_number(m):
output: "thirteen point two three four"
"""
match = m.group(1)
parts = match.split('.')
parts = match.split(".")
words = []
# 遍历字符串中的每个字符
for char in parts[1]:
if char == '.':
if char == ".":
words.append("point")
else:
words.append(char)
@@ -186,7 +191,7 @@ def _expend_fraction(m):
规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法.
规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves.
Examples:
| Written | Said |
|:---:|:---:|
| 1/3 | one third |
@@ -196,39 +201,41 @@ def _expend_fraction(m):
| 3/2 | three halves |
"""
match = m.group(0)
numerator, denominator = map(int, match.split('/'))
numerator, denominator = map(int, match.split("/"))
numerator_part = _inflect.number_to_words(numerator)
if denominator == 2:
if numerator == 1:
denominator_part = 'half'
denominator_part = "half"
else:
denominator_part = 'halves'
denominator_part = "halves"
elif denominator == 1:
return f'{numerator_part}'
return f"{numerator_part}"
else:
denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
if numerator > 1:
denominator_part += 's'
denominator_part += "s"
return f"{numerator_part} {denominator_part}"
return f'{numerator_part} {denominator_part}'
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
return "two thousand"
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
return "two thousand " + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
return _inflect.number_to_words(num // 100) + " hundred"
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
else:
return _inflect.number_to_words(num, andword='')
return _inflect.number_to_words(num, andword="")
def normalize(text):
@@ -238,7 +245,7 @@ def normalize(text):
"""
text = re.sub(_ordinal_number_re, _convert_ordinal, text)
text = re.sub(r'(?<!\d)-|-(?!\d)', ' minus ', text)
text = re.sub(r"(?<!\d)-|-(?!\d)", " minus ", text)
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_time_re, _expand_time, text)
text = re.sub(_measurement_re, _expand_measurement, text)
@@ -251,19 +258,20 @@ def normalize(text):
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
text = ''.join(char for char in unicodedata.normalize('NFD', text)
if unicodedata.category(char) != 'Mn') # Strip accents
text = "".join(
char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn"
) # Strip accents
text = re.sub("%", " percent", text)
text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
text = re.sub(r"(?i)i\.e\.", "that is", text)
text = re.sub(r"(?i)e\.g\.", "for example", text)
# 增加纯大写单词拆分
text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
text = re.sub(r"(?<!^)(?<![\s])([A-Z])", r" \1", text)
return text
if __name__ == '__main__':
if __name__ == "__main__":
# 我觉得其实可以把切分结果展示出来只读或者修改不影响传给TTS的实际text
# 然后让用户确认后再输入给 TTS可以让用户检查自己有没有不标准的输入
print(normalize("1. test ordinal number 1st"))
@@ -272,4 +280,4 @@ if __name__ == '__main__':
print(normalize("1st, 22nd"))
print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
print(normalize("a test of time 4:00, 13:00, 13:30"))
print(normalize("a test of temperature 4°F, 23°C, -19°C"))
print(normalize("a test of temperature 4°F, 23°C, -19°C"))

View File

@@ -8,10 +8,10 @@ from text.symbols import punctuation
from text.symbols2 import symbols
import unicodedata
from builtins import str as unicode
from text.en_normalization.expend import normalize
from nltk.tokenize import TweetTokenizer
word_tokenize = TweetTokenizer().tokenize
from nltk import pos_tag
@@ -122,9 +122,9 @@ def replace_phs(phs):
def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}\s])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}\s])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
@@ -183,6 +183,7 @@ def read_dict_new():
return g2p_dict
def hot_reload_hot(g2p_dict):
with open(CMU_DICT_HOT_PATH) as f:
line = f.readline()
@@ -259,9 +260,12 @@ class en_G2p(G2p):
del self.cmu[word.lower()]
# 修正多音字
self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')
self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
self.homograph2features["complex"] = (
["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
"JJ",
)
def __call__(self, text):
# tokenization
@@ -280,7 +284,7 @@ class en_G2p(G2p):
elif len(word) == 1:
# 单读 A 发音修正, 这里需要原格式 o_word 判断大写
if o_word == "A":
pron = ['EY1']
pron = ["EY1"]
else:
pron = self.cmu[word][0]
# g2p_en 原版多音字处理
@@ -289,7 +293,7 @@ class en_G2p(G2p):
if pos.startswith(pos1):
pron = pron1
# pos1比pos长仅出现在read
elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
pron = pron1
else:
pron = pron2
@@ -302,7 +306,6 @@ class en_G2p(G2p):
return prons[:-1]
def qryword(self, o_word):
word = o_word.lower()
@@ -320,7 +323,7 @@ class en_G2p(G2p):
for w in word:
# 单读 A 发音修正, 此处不存在大写的情况
if w == "a":
phones.extend(['EY1'])
phones.extend(["EY1"])
elif not w.isalpha():
phones.extend([w])
else:
@@ -331,23 +334,23 @@ class en_G2p(G2p):
if re.match(r"^([a-z]+)('s)$", word):
phones = self.qryword(word[:-2])[:]
# P T K F TH HH 无声辅音结尾 's 发 ['S']
if phones[-1] in ['P', 'T', 'K', 'F', 'TH', 'HH']:
phones.extend(['S'])
if phones[-1] in ["P", "T", "K", "F", "TH", "HH"]:
phones.extend(["S"])
# S Z SH ZH CH JH 擦声结尾 's 发 ['IH1', 'Z'] 或 ['AH0', 'Z']
elif phones[-1] in ['S', 'Z', 'SH', 'ZH', 'CH', 'JH']:
phones.extend(['AH0', 'Z'])
elif phones[-1] in ["S", "Z", "SH", "ZH", "CH", "JH"]:
phones.extend(["AH0", "Z"])
# B D G DH V M N NG L R W Y 有声辅音结尾 's 发 ['Z']
# AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
# ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 元音结尾 's 发 ['Z']
else:
phones.extend(['Z'])
phones.extend(["Z"])
return phones
# 尝试进行分词,应对复合词
comps = wordsegment.segment(word.lower())
# 无法分词的送回去预测
if len(comps)==1:
if len(comps) == 1:
return self.predict(word)
# 可以分词的递归处理

View File

@@ -1 +1 @@
from text.g2pw.g2pw import *
from text.g2pw.g2pw import *

View File

@@ -15,6 +15,7 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Tuple
@@ -23,21 +24,24 @@ import numpy as np
from .utils import tokenize_and_map
ANCHOR_CHAR = '▁'
ANCHOR_CHAR = "▁"
def prepare_onnx_input(tokenizer,
labels: List[str],
char2phonemes: Dict[str, List[int]],
chars: List[str],
texts: List[str],
query_ids: List[int],
use_mask: bool=False,
window_size: int=None,
max_len: int=512) -> Dict[str, np.array]:
def prepare_onnx_input(
tokenizer,
labels: List[str],
char2phonemes: Dict[str, List[int]],
chars: List[str],
texts: List[str],
query_ids: List[int],
use_mask: bool = False,
window_size: int = None,
max_len: int = 512,
) -> Dict[str, np.array]:
if window_size is not None:
truncated_texts, truncated_query_ids = _truncate_texts(
window_size=window_size, texts=texts, query_ids=query_ids)
window_size=window_size, texts=texts, query_ids=query_ids
)
input_ids = []
token_type_ids = []
attention_masks = []
@@ -50,33 +54,27 @@ def prepare_onnx_input(tokenizer,
query_id = (truncated_query_ids if window_size else query_ids)[idx]
try:
tokens, text2token, token2text = tokenize_and_map(
tokenizer=tokenizer, text=text)
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
except Exception:
print(f'warning: text "{text}" is invalid')
return {}
text, query_id, tokens, text2token, token2text = _truncate(
max_len=max_len,
text=text,
query_id=query_id,
tokens=tokens,
text2token=text2token,
token2text=token2text)
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
)
processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
input_id = list(
np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int))
attention_mask = list(np.ones((len(processed_tokens), ), dtype=int))
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
query_char = text[query_id]
phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \
if use_mask else [1] * len(labels)
phoneme_mask = (
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
)
char_id = chars.index(query_char)
position_id = text2token[
query_id] + 1 # [CLS] token locate at first place
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
input_ids.append(input_id)
token_type_ids.append(token_type_id)
@@ -86,18 +84,17 @@ def prepare_onnx_input(tokenizer,
position_ids.append(position_id)
outputs = {
'input_ids': np.array(input_ids).astype(np.int64),
'token_type_ids': np.array(token_type_ids).astype(np.int64),
'attention_masks': np.array(attention_masks).astype(np.int64),
'phoneme_masks': np.array(phoneme_masks).astype(np.float32),
'char_ids': np.array(char_ids).astype(np.int64),
'position_ids': np.array(position_ids).astype(np.int64),
"input_ids": np.array(input_ids).astype(np.int64),
"token_type_ids": np.array(token_type_ids).astype(np.int64),
"attention_masks": np.array(attention_masks).astype(np.int64),
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
"char_ids": np.array(char_ids).astype(np.int64),
"position_ids": np.array(position_ids).astype(np.int64),
}
return outputs
def _truncate_texts(window_size: int, texts: List[str],
query_ids: List[int]) -> Tuple[List[str], List[int]]:
def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]:
truncated_texts = []
truncated_query_ids = []
for text, query_id in zip(texts, query_ids):
@@ -111,12 +108,9 @@ def _truncate_texts(window_size: int, texts: List[str],
return truncated_texts, truncated_query_ids
def _truncate(max_len: int,
text: str,
query_id: int,
tokens: List[str],
text2token: List[int],
token2text: List[Tuple[int]]):
def _truncate(
max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]]
):
truncate_len = max_len - 2
if len(tokens) <= truncate_len:
return (text, query_id, tokens, text2token, token2text)
@@ -137,14 +131,16 @@ def _truncate(max_len: int,
start = token2text[token_start][0]
end = token2text[token_end - 1][1]
return (text[start:end], query_id - start, tokens[token_start:token_end], [
i - token_start if i is not None else None
for i in text2token[start:end]
], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
return (
text[start:end],
query_id - start,
tokens[token_start:token_end],
[i - token_start if i is not None else None for i in text2token[start:end]],
[(s - start, e - start) for s, e in token2text[token_start:token_end]],
)
def get_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
@@ -154,13 +150,11 @@ def get_phoneme_labels(polyphonic_chars: List[List[str]]
return labels, char2phonemes
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(
list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
labels = sorted(list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars])))
char2phonemes = {}
for char, phoneme in polyphonic_chars:
if char not in char2phonemes:
char2phonemes[char] = []
char2phonemes[char].append(labels.index(f'{char} {phoneme}'))
char2phonemes[char].append(labels.index(f"{char} {phoneme}"))
return labels, char2phonemes

View File

@@ -17,17 +17,25 @@ PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
class G2PWPinyin(Pinyin):
def __init__(self, model_dir='G2PWModel/', model_source=None,
enable_non_tradional_chinese=True,
v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
def __init__(
self,
model_dir="G2PWModel/",
model_source=None,
enable_non_tradional_chinese=True,
v_to_u=False,
neutral_tone_with_five=False,
tone_sandhi=False,
**kwargs,
):
self._g2pw = G2PWOnnxConverter(
model_dir=model_dir,
style='pinyin',
style="pinyin",
model_source=model_source,
enable_non_tradional_chinese=enable_non_tradional_chinese,
)
self._converter = Converter(
self._g2pw, v_to_u=v_to_u,
self._g2pw,
v_to_u=v_to_u,
neutral_tone_with_five=neutral_tone_with_five,
tone_sandhi=tone_sandhi,
)
@@ -37,31 +45,25 @@ class G2PWPinyin(Pinyin):
class Converter(UltimateConverter):
def __init__(self, g2pw_instance, v_to_u=False,
neutral_tone_with_five=False,
tone_sandhi=False, **kwargs):
def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
super(Converter, self).__init__(
v_to_u=v_to_u,
neutral_tone_with_five=neutral_tone_with_five,
tone_sandhi=tone_sandhi, **kwargs)
v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs
)
self._g2pw = g2pw_instance
def convert(self, words, style, heteronym, errors, strict, **kwargs):
pys = []
if RE_HANS.match(words):
pys = self._to_pinyin(words, style=style, heteronym=heteronym,
errors=errors, strict=strict)
pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict)
post_data = self.post_pinyin(words, heteronym, pys)
if post_data is not None:
pys = post_data
pys = self.convert_styles(
pys, words, style, heteronym, errors, strict)
pys = self.convert_styles(pys, words, style, heteronym, errors, strict)
else:
py = self.handle_nopinyin(words, style=style, errors=errors,
heteronym=heteronym, strict=strict)
py = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict)
if py:
pys.extend(py)
@@ -73,13 +75,11 @@ class Converter(UltimateConverter):
g2pw_pinyin = self._g2pw(han)
if not g2pw_pinyin: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑
return super(Converter, self).convert(
han, Style.TONE, heteronym, errors, strict, **kwargs)
return super(Converter, self).convert(han, Style.TONE, heteronym, errors, strict, **kwargs)
for i, item in enumerate(g2pw_pinyin[0]):
if item is None: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑
py = super(Converter, self).convert(
han[i], Style.TONE, heteronym, errors, strict, **kwargs)
py = super(Converter, self).convert(han[i], Style.TONE, heteronym, errors, strict, **kwargs)
pinyins.extend(py)
else:
pinyins.append([to_tone(item)])
@@ -104,7 +104,7 @@ def _remove_dup_and_empty(lst_list):
if lst:
new_lst_list.append(lst)
else:
new_lst_list.append([''])
new_lst_list.append([""])
return new_lst_list
@@ -127,17 +127,17 @@ def get_dict():
def read_dict():
polyphonic_dict = {}
with open(PP_DICT_PATH,encoding="utf-8") as f:
with open(PP_DICT_PATH, encoding="utf-8") as f:
line = f.readline()
while line:
key, value_str = line.split(':')
key, value_str = line.split(":")
value = eval(value_str.strip())
polyphonic_dict[key.strip()] = value
line = f.readline()
with open(PP_FIX_DICT_PATH,encoding="utf-8") as f:
with open(PP_FIX_DICT_PATH, encoding="utf-8") as f:
line = f.readline()
while line:
key, value_str = line.split(':')
key, value_str = line.split(":")
value = eval(value_str.strip())
polyphonic_dict[key.strip()] = value
line = f.readline()

View File

@@ -2,44 +2,43 @@
# This code is modified from https://github.com/GitYCC/g2pW
import warnings
warnings.filterwarnings("ignore")
import json
import os
import zipfile,requests
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple
import zipfile
from typing import Any, Dict, List, Tuple
import numpy as np
import onnxruntime
import requests
onnxruntime.set_default_logger_severity(3)
from opencc import OpenCC
from pypinyin import Style, pinyin
from transformers import AutoTokenizer
from pypinyin import pinyin
from pypinyin import Style
from .dataset import get_char_phoneme_labels
from .dataset import get_phoneme_labels
from .dataset import prepare_onnx_input
from .utils import load_config
from ..zh_normalization.char_convert import tranditional_to_simplified
from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input
from .utils import load_config
model_version = '1.1'
model_version = "1.1"
def predict(session, onnx_input: Dict[str, Any],
labels: List[str]) -> Tuple[List[str], List[float]]:
def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[List[str], List[float]]:
all_preds = []
all_confidences = []
probs = session.run([], {
"input_ids": onnx_input['input_ids'],
"token_type_ids": onnx_input['token_type_ids'],
"attention_mask": onnx_input['attention_masks'],
"phoneme_mask": onnx_input['phoneme_masks'],
"char_ids": onnx_input['char_ids'],
"position_ids": onnx_input['position_ids']
})[0]
probs = session.run(
[],
{
"input_ids": onnx_input["input_ids"],
"token_type_ids": onnx_input["token_type_ids"],
"attention_mask": onnx_input["attention_masks"],
"phoneme_mask": onnx_input["phoneme_masks"],
"char_ids": onnx_input["char_ids"],
"position_ids": onnx_input["position_ids"],
},
)[0]
preds = np.argmax(probs, axis=1).tolist()
max_probs = []
@@ -51,17 +50,17 @@ def predict(session, onnx_input: Dict[str, Any],
return all_preds, all_confidences
def download_and_decompress(model_dir: str='G2PWModel/'):
def download_and_decompress(model_dir: str = "G2PWModel/"):
if not os.path.exists(model_dir):
parent_directory = os.path.dirname(model_dir)
zip_dir = os.path.join(parent_directory,"G2PWModel_1.1.zip")
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory,"G2PWModel")
zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip")
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
print("Downloading g2pw model...")
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
with requests.get(modelscope_url, stream=True) as r:
r.raise_for_status()
with open(zip_dir, 'wb') as f:
with open(zip_dir, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
@@ -69,17 +68,20 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
print("Extracting g2pw model...")
with zipfile.ZipFile(zip_dir, "r") as zip_ref:
zip_ref.extractall(parent_directory)
os.rename(extract_dir, extract_dir_new)
return model_dir
class G2PWOnnxConverter:
def __init__(self,
model_dir: str='G2PWModel/',
style: str='bopomofo',
model_source: str=None,
enable_non_tradional_chinese: bool=False):
def __init__(
self,
model_dir: str = "G2PWModel/",
style: str = "bopomofo",
model_source: str = None,
enable_non_tradional_chinese: bool = False,
):
uncompress_path = download_and_decompress(model_dir)
sess_options = onnxruntime.SessionOptions()
@@ -87,41 +89,59 @@ class G2PWOnnxConverter:
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
sess_options.intra_op_num_threads = 2
try:
self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
except:
self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CPUExecutionProvider'])
self.config = load_config(
config_path=os.path.join(uncompress_path, 'config.py'),
use_default=True)
self.session_g2pW = onnxruntime.InferenceSession(
os.path.join(uncompress_path, "g2pW.onnx"),
sess_options=sess_options,
providers=["CPUExecutionProvider"],
)
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
self.model_source = model_source if model_source else self.config.model_source
self.enable_opencc = enable_non_tradional_chinese
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
polyphonic_chars_path = os.path.join(uncompress_path,
'POLYPHONIC_CHARS.txt')
monophonic_chars_path = os.path.join(uncompress_path,
'MONOPHONIC_CHARS.txt')
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
self.polyphonic_chars = [
line.split('\t')
for line in open(polyphonic_chars_path, encoding='utf-8').read()
.strip().split('\n')
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
self.non_polyphonic = {
'', '', '', '', '', '', '', '', '', '', '', '', '',
'', '', '', '', '', ''
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
}
self.non_monophonic = {'', ''}
self.non_monophonic = {"", ""}
self.monophonic_chars = [
line.split('\t')
for line in open(monophonic_chars_path, encoding='utf-8').read()
.strip().split('\n')
line.split("\t") for line in open(monophonic_chars_path, encoding="utf-8").read().strip().split("\n")
]
self.labels, self.char2phonemes = get_char_phoneme_labels(
polyphonic_chars=self.polyphonic_chars
) if self.config.use_char_phoneme else get_phoneme_labels(
polyphonic_chars=self.polyphonic_chars)
self.labels, self.char2phonemes = (
get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
if self.config.use_char_phoneme
else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
)
self.chars = sorted(list(self.char2phonemes.keys()))
@@ -130,41 +150,29 @@ class G2PWOnnxConverter:
if char in self.polyphonic_chars_new:
self.polyphonic_chars_new.remove(char)
self.monophonic_chars_dict = {
char: phoneme
for char, phoneme in self.monophonic_chars
}
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
for char in self.non_monophonic:
if char in self.monophonic_chars_dict:
self.monophonic_chars_dict.pop(char)
self.pos_tags = [
'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI'
]
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
with open(
os.path.join(uncompress_path,
'bopomofo_to_pinyin_wo_tune_dict.json'),
'r',
encoding='utf-8') as fr:
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
self.bopomofo_convert_dict = json.load(fr)
self.style_convert_func = {
'bopomofo': lambda x: x,
'pinyin': self._convert_bopomofo_to_pinyin,
"bopomofo": lambda x: x,
"pinyin": self._convert_bopomofo_to_pinyin,
}[style]
with open(
os.path.join(uncompress_path, 'char_bopomofo_dict.json'),
'r',
encoding='utf-8') as fr:
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
self.char_bopomofo_dict = json.load(fr)
if self.enable_opencc:
self.cc = OpenCC('s2tw')
self.cc = OpenCC("s2tw")
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
tone = bopomofo[-1]
assert tone in '12345'
assert tone in "12345"
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
if component:
return component + tone
@@ -184,8 +192,7 @@ class G2PWOnnxConverter:
translated_sentences.append(translated_sent)
sentences = translated_sentences
texts, query_ids, sent_ids, partial_results = self._prepare_data(
sentences=sentences)
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
if len(texts) == 0:
# sentences no polyphonic words
return partial_results
@@ -198,14 +205,12 @@ class G2PWOnnxConverter:
texts=texts,
query_ids=query_ids,
use_mask=self.config.use_mask,
window_size=None)
window_size=None,
)
preds, confidences = predict(
session=self.session_g2pW,
onnx_input=onnx_input,
labels=self.labels)
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
if self.config.use_char_phoneme:
preds = [pred.split(' ')[1] for pred in preds]
preds = [pred.split(" ")[1] for pred in preds]
results = partial_results
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
@@ -213,15 +218,12 @@ class G2PWOnnxConverter:
return results
def _prepare_data(
self, sentences: List[str]
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
texts, query_ids, sent_ids, partial_results = [], [], [], []
for sent_id, sent in enumerate(sentences):
# pypinyin works well for Simplified Chinese than Traditional Chinese
sent_s = tranditional_to_simplified(sent)
pypinyin_result = pinyin(
sent_s, neutral_tone_with_five=True, style=Style.TONE3)
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
partial_result = [None] * len(sent)
for i, char in enumerate(sent):
if char in self.polyphonic_chars_new:
@@ -229,8 +231,7 @@ class G2PWOnnxConverter:
query_ids.append(i)
sent_ids.append(sent_id)
elif char in self.monophonic_chars_dict:
partial_result[i] = self.style_convert_func(
self.monophonic_chars_dict[char])
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
elif char in self.char_bopomofo_dict:
partial_result[i] = pypinyin_result[i][0]
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])

View File

@@ -15,6 +15,7 @@
Credits
This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re
@@ -24,14 +25,14 @@ def wordize_and_map(text: str):
index_map_from_text_to_word = []
index_map_from_word_to_text = []
while len(text) > 0:
match_space = re.match(r'^ +', text)
match_space = re.match(r"^ +", text)
if match_space:
space_str = match_space.group(0)
index_map_from_text_to_word += [None] * len(space_str)
text = text[len(space_str):]
text = text[len(space_str) :]
continue
match_en = re.match(r'^[a-zA-Z0-9]+', text)
match_en = re.match(r"^[a-zA-Z0-9]+", text)
if match_en:
en_word = match_en.group(0)
@@ -42,7 +43,7 @@ def wordize_and_map(text: str):
index_map_from_text_to_word += [len(words)] * len(en_word)
words.append(en_word)
text = text[len(en_word):]
text = text[len(en_word) :]
else:
word_start_pos = len(index_map_from_text_to_word)
word_end_pos = word_start_pos + 1
@@ -63,15 +64,14 @@ def tokenize_and_map(tokenizer, text: str):
for word, (word_start, word_end) in zip(words, word2text):
word_tokens = tokenizer.tokenize(word)
if len(word_tokens) == 0 or word_tokens == ['[UNK]']:
if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
index_map_from_token_to_text.append((word_start, word_end))
tokens.append('[UNK]')
tokens.append("[UNK]")
else:
current_word_start = word_start
for word_token in word_tokens:
word_token_len = len(re.sub(r'^##', '', word_token))
index_map_from_token_to_text.append(
(current_word_start, current_word_start + word_token_len))
word_token_len = len(re.sub(r"^##", "", word_token))
index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len))
current_word_start = current_word_start + word_token_len
tokens.append(word_token)
@@ -85,53 +85,51 @@ def tokenize_and_map(tokenizer, text: str):
def _load_config(config_path: os.PathLike):
import importlib.util
spec = importlib.util.spec_from_file_location('__init__', config_path)
spec = importlib.util.spec_from_file_location("__init__", config_path)
config = importlib.util.module_from_spec(spec)
spec.loader.exec_module(config)
return config
default_config_dict = {
'manual_seed': 1313,
'model_source': 'bert-base-chinese',
'window_size': 32,
'num_workers': 2,
'use_mask': True,
'use_char_phoneme': False,
'use_conditional': True,
'param_conditional': {
'affect_location': 'softmax',
'bias': True,
'char-linear': True,
'pos-linear': False,
'char+pos-second': True,
'char+pos-second_lowrank': False,
'lowrank_size': 0,
'char+pos-second_fm': False,
'fm_size': 0,
'fix_mode': None,
'count_json': 'train.count.json'
"manual_seed": 1313,
"model_source": "bert-base-chinese",
"window_size": 32,
"num_workers": 2,
"use_mask": True,
"use_char_phoneme": False,
"use_conditional": True,
"param_conditional": {
"affect_location": "softmax",
"bias": True,
"char-linear": True,
"pos-linear": False,
"char+pos-second": True,
"char+pos-second_lowrank": False,
"lowrank_size": 0,
"char+pos-second_fm": False,
"fm_size": 0,
"fix_mode": None,
"count_json": "train.count.json",
},
'lr': 5e-5,
'val_interval': 200,
'num_iter': 10000,
'use_focal': False,
'param_focal': {
'alpha': 0.0,
'gamma': 0.7
"lr": 5e-5,
"val_interval": 200,
"num_iter": 10000,
"use_focal": False,
"param_focal": {"alpha": 0.0, "gamma": 0.7},
"use_pos": True,
"param_pos ": {
"weight": 0.1,
"pos_joint_training": True,
"train_pos_path": "train.pos",
"valid_pos_path": "dev.pos",
"test_pos_path": "test.pos",
},
'use_pos': True,
'param_pos ': {
'weight': 0.1,
'pos_joint_training': True,
'train_pos_path': 'train.pos',
'valid_pos_path': 'dev.pos',
'test_pos_path': 'test.pos'
}
}
def load_config(config_path: os.PathLike, use_default: bool=False):
def load_config(config_path: os.PathLike, use_default: bool = False):
config = _load_config(config_path)
if use_default:
for attr, val in default_config_dict.items():

View File

@@ -2,43 +2,51 @@
import re
import os
import hashlib
try:
import pyopenjtalk
current_file_path = os.path.dirname(__file__)
# 防止win下无法读取模型
if os.name == 'nt':
if os.name == "nt":
python_dir = os.getcwd()
OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8")
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', OPEN_JTALK_DICT_DIR)):
if (OPEN_JTALK_DICT_DIR[:len(python_dir)].upper() == python_dir.upper()):
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR,python_dir))
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)):
if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper():
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir))
else:
import shutil
if not os.path.exists('TEMP'):
os.mkdir('TEMP')
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ja")):
os.mkdir(os.path.join("TEMP", "ja"))
if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")):
shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic"))
shutil.copytree(pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), os.path.join("TEMP", "ja", "open_jtalk_dic"), )
shutil.copytree(
pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"),
os.path.join("TEMP", "ja", "open_jtalk_dic"),
)
OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic")
pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8")
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', current_file_path)):
if (current_file_path[:len(python_dir)].upper() == python_dir.upper()):
current_file_path = os.path.join(os.path.relpath(current_file_path,python_dir))
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)):
if current_file_path[: len(python_dir)].upper() == python_dir.upper():
current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir))
else:
if not os.path.exists('TEMP'):
os.mkdir('TEMP')
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ja")):
os.mkdir(os.path.join("TEMP", "ja"))
if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")):
os.mkdir(os.path.join("TEMP", "ja", "ja_userdic"))
shutil.copyfile(os.path.join(current_file_path, "ja_userdic", "userdict.csv"),os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"))
shutil.copyfile(
os.path.join(current_file_path, "ja_userdic", "userdict.csv"),
os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"),
)
current_file_path = os.path.join("TEMP", "ja")
def get_hash(fp: str) -> str:
hash_md5 = hashlib.md5()
with open(fp, "rb") as f:
@@ -51,21 +59,26 @@ try:
USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
# 如果没有用户词典就生成一个如果有就检查md5如果不一样就重新生成
if os.path.exists(USERDIC_CSV_PATH):
if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read():
if (
not os.path.exists(USERDIC_BIN_PATH)
or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read()
):
pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
f.write(get_hash(USERDIC_CSV_PATH))
if os.path.exists(USERDIC_BIN_PATH):
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
except Exception as e:
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
except Exception:
# print(e)
import pyopenjtalk
# failed to load user dictionary, ignore.
pass
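The comment in this hunk describes a hash-gated rebuild: recompile the user dictionary only when the binary is missing or the CSV's MD5 no longer matches the recorded one. A generic sketch of that pattern, with the expensive compile step passed in as a callable (for example pyopenjtalk.mecab_dict_index, as used above):

import hashlib
import os

def file_md5(path):
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def rebuild_if_stale(csv_path, bin_path, hash_path, compile_fn):
    # Recompile only when the binary is missing or the CSV changed since the last build.
    current = file_md5(csv_path)
    recorded = None
    if os.path.exists(hash_path):
        with open(hash_path, "r", encoding="utf-8") as f:
            recorded = f.read()
    if not os.path.exists(bin_path) or recorded != current:
        compile_fn(csv_path, bin_path)
        with open(hash_path, "w", encoding="utf-8") as f:
            f.write(current)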
from text.symbols import punctuation
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
@@ -123,9 +136,9 @@ def post_replace_ph(ph):
def replace_consecutive_punctuation(text):
punctuations = ''.join(re.escape(p) for p in punctuation)
pattern = f'([{punctuations}])([{punctuations}])+'
result = re.sub(pattern, r'\1', text)
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
result = re.sub(pattern, r"\1", text)
return result
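The pattern keeps only the first mark of any punctuation run. A self-contained illustration with the punctuation set hard-coded (the real list comes from text.symbols):

import re

punctuation = ["!", "?", "…", ",", "."]
punctuations = "".join(re.escape(p) for p in punctuation)
pattern = f"([{punctuations}])([{punctuations}])+"
print(re.sub(pattern, r"\1", "どうして…!?"))  # "どうして…"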
@@ -152,7 +165,7 @@ def preprocess_jap(text, with_prosody=False):
text += p.split(" ")
if i < len(marks):
if marks[i] == " ":# 防止意外的UNK
if marks[i] == " ": # 防止意外的UNK
continue
text += [marks[i].replace(" ", "")]
return text
@@ -165,6 +178,7 @@ def text_normalize(text):
text = replace_consecutive_punctuation(text)
return text
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
"""Extract phoneme + prosoody symbol sequence from input full-context labels.
@@ -241,6 +255,7 @@ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
return phones
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
def _numeric_feature_by_regex(regex, s):
match = re.search(regex, s)
@@ -248,6 +263,7 @@ def _numeric_feature_by_regex(regex, s):
return -50
return int(match.group(1))
def g2p(norm_text, with_prosody=True):
phones = preprocess_jap(norm_text, with_prosody)
phones = [post_replace_ph(i) for i in phones]

View File

@@ -9,39 +9,43 @@ import importlib
import os
# 防止win下无法读取模型
if os.name == 'nt':
if os.name == "nt":
class win_G2p(G2p):
def check_mecab(self):
super().check_mecab()
spam_spec = importlib.util.find_spec("eunjeon")
non_found = spam_spec is None
if non_found:
print(f'you have to install eunjeon. install it...')
print("you have to install eunjeon. install it...")
else:
installpath = spam_spec.submodule_search_locations[0]
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import sys
from eunjeon import Mecab as _Mecab
class Mecab(_Mecab):
def get_dicpath(installpath):
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import shutil
python_dir = os.getcwd()
if (installpath[:len(python_dir)].upper() == python_dir.upper()):
dicpath = os.path.join(os.path.relpath(installpath,python_dir),'data','mecabrc')
else:
if not os.path.exists('TEMP'):
os.mkdir('TEMP')
if not os.path.exists(os.path.join('TEMP', 'ko')):
os.mkdir(os.path.join('TEMP', 'ko'))
if os.path.exists(os.path.join('TEMP', 'ko', 'ko_dict')):
shutil.rmtree(os.path.join('TEMP', 'ko', 'ko_dict'))
shutil.copytree(os.path.join(installpath, 'data'), os.path.join('TEMP', 'ko', 'ko_dict'))
dicpath = os.path.join('TEMP', 'ko', 'ko_dict', 'mecabrc')
python_dir = os.getcwd()
if installpath[: len(python_dir)].upper() == python_dir.upper():
dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc")
else:
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ko")):
os.mkdir(os.path.join("TEMP", "ko"))
if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")):
shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict"))
shutil.copytree(
os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict")
)
dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc")
else:
dicpath=os.path.abspath(os.path.join(installpath, 'data/mecabrc'))
dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc"))
return dicpath
def __init__(self, dicpath=get_dicpath(installpath)):
@@ -52,97 +56,108 @@ if os.name == 'nt':
G2p = win_G2p
from text.symbols2 import symbols
from text.symbols2 import symbols
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
_korean_classifiers = (
"군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
)
# List of (hangul, hangul divided) pairs:
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
# ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
# ('ㄵ', 'ㄴㅈ'),
# ('', 'ㄴㅎ'),
# ('', 'ㄹㄱ'),
# ('', 'ㄹㅁ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㅂㅅ'),
('', 'ㅗㅏ'),
('', 'ㅗㅐ'),
('', 'ㅗㅣ'),
('', 'ㅜㅓ'),
('', 'ㅜㅔ'),
('', 'ㅜㅣ'),
('', 'ㅡㅣ'),
('', 'ㅣㅏ'),
('', 'ㅣㅐ'),
('', 'ㅣㅓ'),
('', 'ㅣㅔ'),
('', 'ㅣㅗ'),
('', 'ㅣㅜ')
]]
_hangul_divided = [
(re.compile("%s" % x[0]), x[1])
for x in [
# ('', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
# ('', 'ㄴㅈ'),
# ('', 'ㄴㅎ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹㅍ'),
# ('ㅀ', 'ㄹㅎ'),
# ('ㅄ', 'ㅂㅅ'),
("", "ㅗㅏ"),
("", "ㅗㅐ"),
("", "ㅗㅣ"),
("", "ㅜㅓ"),
("", "ㅜㅔ"),
("", "ㅜㅣ"),
("", "ㅡㅣ"),
("", "ㅣㅏ"),
("", "ㅣㅐ"),
("", "ㅣㅓ"),
("", "ㅣㅔ"),
("", "ㅣㅗ"),
("", "ㅣㅜ"),
]
]
# List of (Latin alphabet, hangul) pairs:
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('a', '에이'),
('b', ''),
('c', ''),
('d', ''),
('e', ''),
('f', '에프'),
('g', ''),
('h', '에이치'),
('i', '아이'),
('j', '제이'),
('k', '케이'),
('l', ''),
('m', ''),
('n', ''),
('o', ''),
('p', ''),
('q', ''),
('r', '아르'),
('s', '에스'),
('t', ''),
('u', ''),
('v', '브이'),
('w', '더블유'),
('x', '엑스'),
('y', '와이'),
('z', '제트')
]]
_latin_to_hangul = [
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
for x in [
("a", "에이"),
("b", ""),
("c", ""),
("d", ""),
("e", ""),
("f", "에프"),
("g", ""),
("h", "에이치"),
("i", "아이"),
("j", "제이"),
("k", "케이"),
("l", ""),
("m", ""),
("n", ""),
("o", ""),
("p", ""),
("q", ""),
("r", "아르"),
("s", "에스"),
("t", ""),
("u", ""),
("v", "브이"),
("w", "더블유"),
("x", "엑스"),
("y", "와이"),
("z", "제트"),
]
]
# List of (ipa, lazy ipa) pairs:
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('t͡ɕ','ʧ'),
('d͡ʑ','ʥ'),
('ɲ','n^'),
('ɕ','ʃ'),
('ʷ','w'),
('ɭ','l`'),
('ʎ','ɾ'),
('ɣ','ŋ'),
('ɰ','ɯ'),
('ʝ','j'),
('ʌ','ə'),
('ɡ','g'),
('\u031a','#'),
('\u0348','='),
('\u031e',''),
('\u0320',''),
('\u0339','')
]]
_ipa_to_lazy_ipa = [
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
for x in [
("t͡ɕ", "ʧ"),
("d͡ʑ", "ʥ"),
("ɲ", "n^"),
("ɕ", "ʃ"),
("ʷ", "w"),
("ɭ", "l`"),
("ʎ", "ɾ"),
("ɣ", "ŋ"),
("ɰ", "ɯ"),
("ʝ", "j"),
("ʌ", "ə"),
("ɡ", "g"),
("\u031a", "#"),
("\u0348", "="),
("\u031e", ""),
("\u0320", ""),
("\u0339", ""),
]
]
def fix_g2pk2_error(text):
new_text = ""
i = 0
while i < len(text) - 4:
if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == '':
new_text += text[i:i+3] + ' ' + ''
if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "":
new_text += text[i : i + 3] + " " + ""
i += 5
else:
new_text += text[i]
@@ -166,20 +181,20 @@ def divide_hangul(text):
def hangul_number(num, sino=True):
'''Reference https://github.com/Kyubyong/g2pK'''
num = re.sub(',', '', num)
"""Reference https://github.com/Kyubyong/g2pK"""
num = re.sub(",", "", num)
if num == '0':
return ''
if not sino and num == '20':
return '스무'
if num == "0":
return ""
if not sino and num == "20":
return "스무"
digits = '123456789'
names = '일이삼사오육칠팔구'
digits = "123456789"
names = "일이삼사오육칠팔구"
digit2name = {d: n for d, n in zip(digits, names)}
modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
@@ -188,75 +203,75 @@ def hangul_number(num, sino=True):
i = len(num) - i - 1
if sino:
if i == 0:
name = digit2name.get(digit, '')
name = digit2name.get(digit, "")
elif i == 1:
name = digit2name.get(digit, '') + ''
name = name.replace('일십', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일십", "")
else:
if i == 0:
name = digit2mod.get(digit, '')
name = digit2mod.get(digit, "")
elif i == 1:
name = digit2dec.get(digit, '')
if digit == '0':
name = digit2dec.get(digit, "")
if digit == "0":
if i % 4 == 0:
last_three = spelledout[-min(3, len(spelledout)):]
if ''.join(last_three) == '':
spelledout.append('')
last_three = spelledout[-min(3, len(spelledout)) :]
if "".join(last_three) == "":
spelledout.append("")
continue
else:
spelledout.append('')
spelledout.append("")
continue
if i == 2:
name = digit2name.get(digit, '') + ''
name = name.replace('일백', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일백", "")
elif i == 3:
name = digit2name.get(digit, '') + ''
name = name.replace('일천', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일천", "")
elif i == 4:
name = digit2name.get(digit, '') + ''
name = name.replace('일만', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일만", "")
elif i == 5:
name = digit2name.get(digit, '') + ''
name = name.replace('일십', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일십", "")
elif i == 6:
name = digit2name.get(digit, '') + ''
name = name.replace('일백', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일백", "")
elif i == 7:
name = digit2name.get(digit, '') + ''
name = name.replace('일천', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일천", "")
elif i == 8:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 9:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 10:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 11:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 12:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 13:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 14:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 15:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
spelledout.append(name)
return ''.join(elem for elem in spelledout)
return "".join(elem for elem in spelledout)
def number_to_hangul(text):
'''Reference https://github.com/Kyubyong/g2pK'''
tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
"""Reference https://github.com/Kyubyong/g2pK"""
tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text))
for token in tokens:
num, classifier = token
if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
spelledout = hangul_number(num, sino=False)
else:
spelledout = hangul_number(num, sino=True)
text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}")
# digit by digit for remaining digits
digits = '0123456789'
names = '영일이삼사오육칠팔구'
digits = "0123456789"
names = "영일이삼사오육칠팔구"
for d, n in zip(digits, names):
text = text.replace(d, n)
return text
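Roughly: a number followed by a counter from _korean_classifiers is read with native Korean numerals, anything else with Sino-Korean ones, and leftover digits fall back to a digit-by-digit reading. A hedged usage example (assuming the GPT_SoVITS folder is on sys.path and the module's g2pk dependencies are installed; the expected outputs follow the rules above rather than an actual run):

from text.korean import number_to_hangul

print(number_to_hangul("사과 3개"))  # expected "사과 세개": 개 is a classifier, so the native numeral 세 is used
print(number_to_hangul("7호실"))    # expected "칠호실": 호 is not a classifier, so the Sino-Korean 칠 is used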
@@ -265,19 +280,23 @@ def number_to_hangul(text):
def korean_to_lazy_ipa(text):
text = latin_to_hangul(text)
text = number_to_hangul(text)
text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text)
for regex, replacement in _ipa_to_lazy_ipa:
text = re.sub(regex, replacement, text)
return text
_g2p=G2p()
_g2p = G2p()
def korean_to_ipa(text):
text = latin_to_hangul(text)
text = number_to_hangul(text)
text = _g2p(text)
text = fix_g2pk2_error(text)
text = korean_to_lazy_ipa(text)
return text.replace('ʧ','').replace('ʥ','')
return text.replace("ʧ", "").replace("ʥ", "")
def post_replace_ph(ph):
rep_map = {
@@ -301,12 +320,13 @@ def post_replace_ph(ph):
ph = ""
return ph
def g2p(text):
text = latin_to_hangul(text)
text = _g2p(text)
text = divide_hangul(text)
text = fix_g2pk2_error(text)
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
text = re.sub(r"([\u3131-\u3163])$", r"\1.", text)
# text = "".join([post_replace_ph(i) for i in text])
text = [post_replace_ph(i) for i in text]
return text
@@ -314,4 +334,4 @@ def g2p(text):
if __name__ == "__main__":
text = "안녕하세요"
print(g2p(text))
print(g2p(text))

View File

@@ -1,5 +1,3 @@
import os
# punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
punctuation = ["!", "?", "", ",", "."] # @是SP停顿
punctuation.append("-")

View File

@@ -1,5 +1,3 @@
import os
# punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
punctuation = ["!", "?", "", ",", "."] # @是SP停顿
punctuation.append("-")
@@ -396,24 +394,404 @@ arpa = {
"SH",
}
ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停'
ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
# ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
yue_symbols={'Yeot3', 'Yip1', 'Yyu3', 'Yeng4', 'Yut5', 'Yaan5', 'Ym5', 'Yaan6', 'Yang1', 'Yun4', 'Yon2', 'Yui5', 'Yun2', 'Yat3', 'Ye', 'Yeot1', 'Yoeng5', 'Yoek2', 'Yam2', 'Yeon6', 'Yu6', 'Yiu3', 'Yaang6', 'Yp5', 'Yai4', 'Yoek4', 'Yit6', 'Yam5', 'Yoeng6', 'Yg1', 'Yk3', 'Yoe4', 'Yam3', 'Yc', 'Yyu4', 'Yyut1', 'Yiu4', 'Ying3', 'Yip3', 'Yaap3', 'Yau3', 'Yan4', 'Yau1', 'Yap4', 'Yk6', 'Yok3', 'Yai1', 'Yeot6', 'Yan2', 'Yoek6', 'Yt1', 'Yoi1', 'Yit5', 'Yn4', 'Yaau3', 'Yau4', 'Yuk6', 'Ys', 'Yuk', 'Yin6', 'Yung6', 'Ya', 'You', 'Yaai5', 'Yau5', 'Yoi3', 'Yaak3', 'Yaat3', 'Ying2', 'Yok5', 'Yeng2', 'Yyut3', 'Yam1', 'Yip5', 'You1', 'Yam6', 'Yaa5', 'Yi6', 'Yek4', 'Yyu2', 'Yuk5', 'Yaam1', 'Yang2', 'Yai', 'Yiu6', 'Yin4', 'Yok4', 'Yot3', 'Yui2', 'Yeoi5', 'Yyun6', 'Yyu5', 'Yoi5', 'Yeot2', 'Yim4', 'Yeoi2', 'Yaan1', 'Yang6', 'Yong1', 'Yaang4', 'Yung5', 'Yeon1', 'Yin2', 'Ya3', 'Yaang3', 'Yg', 'Yk2', 'Yaau5', 'Yut1', 'Yt5', 'Yip4', 'Yung4', 'Yj', 'Yong3', 'Ya1', 'Yg6', 'Yaau6', 'Yit3', 'Yun3', 'Ying1', 'Yn2', 'Yg4', 'Yl', 'Yp3', 'Yn3', 'Yak1', 'Yang5', 'Yoe6', 'You2', 'Yap2', 'Yak2', 'Yt3', 'Yot5', 'Yim2', 'Yi1', 'Yn6', 'Yaat5', 'Yaam3', 'Yoek5', 'Ye3', 'Yeon4', 'Yaa2', 'Yu3', 'Yim6', 'Ym', 'Yoe3', 'Yaai2', 'Ym2', 'Ya6', 'Yeng6', 'Yik4', 'Yot4', 'Yaai4', 'Yyun3', 'Yu1', 'Yoeng1', 'Yaap2', 'Yuk3', 'Yoek3', 'Yeng5', 'Yeoi1', 'Yiu2', 'Yok1', 'Yo1', 'Yoek1', 'Yoeng2', 'Yeon5', 'Yiu1', 'Yoeng4', 'Yuk2', 'Yat4', 'Yg5', 'Yut4', 'Yan6', 'Yin3', 'Yaa6', 'Yap1', 'Yg2', 'Yoe5', 'Yt4', 'Ya5', 'Yo4', 'Yyu1', 'Yak3', 'Yeon2', 'Yong4', 'Ym1', 'Ye2', 'Yaang5', 'Yoi2', 'Yeng3', 'Yn', 'Yyut4', 'Yau', 'Yaak2', 'Yaan4', 'Yek2', 'Yin1', 'Yi5', 'Yoe2', 'Yei5', 'Yaat6', 'Yak5', 'Yp6', 'Yok6', 'Yei2', 'Yaap1', 'Yyut5', 'Yi4', 'Yim1', 'Yk5', 'Ye4', 'Yok2', 'Yaam6', 'Yat2', 'Yon6', 'Yei3', 'Yyu6', 'Yeot5', 'Yk4', 'Yai6', 'Yd', 'Yg3', 'Yei6', 'Yau2', 'Yok', 'Yau6', 'Yung3', 'Yim5', 'Yut6', 'Yit1', 'Yon3', 'Yat1', 'Yaam2', 'Yyut2', 'Yui6', 'Yt2', 'Yek6', 'Yt', 'Ye6', 'Yang3', 'Ying6', 'Yaau1', 'Yeon3', 'Yng', 'Yh', 'Yang4', 'Ying5', 'Yaap6', 'Yoeng3', 'Yyun4', 'You3', 'Yan5', 'Yat5', 'Yot1', 'Yun1', 'Yi3', 'Yaa1', 'Yaap4', 'You6', 'Yaang2', 'Yaap5', 'Yaa3', 'Yaak6', 'Yeng1', 'Yaak1', 'Yo5', 'Yoi4', 'Yam4', 'Yik1', 'Ye1', 'Yai5', 'Yung1', 'Yp2', 'Yui4', 'Yaak4', 'Yung2', 'Yak4', 'Yaat4', 'Yeoi4', 'Yut2', 'Yin5', 'Yaau4', 'Yap6', 'Yb', 'Yaam4', 'Yw', 'Yut3', 'Yong2', 'Yt6', 'Yaai6', 'Yap5', 'Yik5', 'Yun6', 'Yaam5', 'Yun5', 'Yik3', 'Ya2', 'Yyut6', 'Yon4', 'Yk1', 'Yit4', 'Yak6', 'Yaan2', 'Yuk1', 'Yai2', 'Yik2', 'Yaat2', 'Yo3', 'Ykw', 'Yn5', 'Yaa', 'Ye5', 'Yu4', 'Yei1', 'Yai3', 'Yyun5', 'Yip2', 'Yaau2', 'Yiu5', 'Ym4', 'Yeoi6', 'Yk', 'Ym6', 'Yoe1', 'Yeoi3', 'Yon', 'Yuk4', 'Yaai3', 'Yaa4', 'Yot6', 'Yaang1', 'Yei4', 'Yek1', 'Yo', 'Yp', 'Yo6', 'Yp4', 'Yan3', 'Yoi', 'Yap3', 'Yek3', 'Yim3', 'Yz', 'Yot2', 'Yoi6', 'Yit2', 'Yu5', 'Yaan3', 'Yan1', 'Yon5', 'Yp1', 'Yong5', 'Ygw', 'Yak', 'Yat6', 'Ying4', 'Yu2', 'Yf', 'Ya4', 'Yon1', 'You4', 'Yik6', 'Yui1', 'Yaat1', 'Yeot4', 'Yi2', 'Yaai1', 'Yek5', 'Ym3', 'Yong6', 'You5', 'Yyun1', 'Yn1', 'Yo2', 'Yip6', 'Yui3', 'Yaak5', 'Yyun2'}
yue_symbols = {
"Yeot3",
"Yip1",
"Yyu3",
"Yeng4",
"Yut5",
"Yaan5",
"Ym5",
"Yaan6",
"Yang1",
"Yun4",
"Yon2",
"Yui5",
"Yun2",
"Yat3",
"Ye",
"Yeot1",
"Yoeng5",
"Yoek2",
"Yam2",
"Yeon6",
"Yu6",
"Yiu3",
"Yaang6",
"Yp5",
"Yai4",
"Yoek4",
"Yit6",
"Yam5",
"Yoeng6",
"Yg1",
"Yk3",
"Yoe4",
"Yam3",
"Yc",
"Yyu4",
"Yyut1",
"Yiu4",
"Ying3",
"Yip3",
"Yaap3",
"Yau3",
"Yan4",
"Yau1",
"Yap4",
"Yk6",
"Yok3",
"Yai1",
"Yeot6",
"Yan2",
"Yoek6",
"Yt1",
"Yoi1",
"Yit5",
"Yn4",
"Yaau3",
"Yau4",
"Yuk6",
"Ys",
"Yuk",
"Yin6",
"Yung6",
"Ya",
"You",
"Yaai5",
"Yau5",
"Yoi3",
"Yaak3",
"Yaat3",
"Ying2",
"Yok5",
"Yeng2",
"Yyut3",
"Yam1",
"Yip5",
"You1",
"Yam6",
"Yaa5",
"Yi6",
"Yek4",
"Yyu2",
"Yuk5",
"Yaam1",
"Yang2",
"Yai",
"Yiu6",
"Yin4",
"Yok4",
"Yot3",
"Yui2",
"Yeoi5",
"Yyun6",
"Yyu5",
"Yoi5",
"Yeot2",
"Yim4",
"Yeoi2",
"Yaan1",
"Yang6",
"Yong1",
"Yaang4",
"Yung5",
"Yeon1",
"Yin2",
"Ya3",
"Yaang3",
"Yg",
"Yk2",
"Yaau5",
"Yut1",
"Yt5",
"Yip4",
"Yung4",
"Yj",
"Yong3",
"Ya1",
"Yg6",
"Yaau6",
"Yit3",
"Yun3",
"Ying1",
"Yn2",
"Yg4",
"Yl",
"Yp3",
"Yn3",
"Yak1",
"Yang5",
"Yoe6",
"You2",
"Yap2",
"Yak2",
"Yt3",
"Yot5",
"Yim2",
"Yi1",
"Yn6",
"Yaat5",
"Yaam3",
"Yoek5",
"Ye3",
"Yeon4",
"Yaa2",
"Yu3",
"Yim6",
"Ym",
"Yoe3",
"Yaai2",
"Ym2",
"Ya6",
"Yeng6",
"Yik4",
"Yot4",
"Yaai4",
"Yyun3",
"Yu1",
"Yoeng1",
"Yaap2",
"Yuk3",
"Yoek3",
"Yeng5",
"Yeoi1",
"Yiu2",
"Yok1",
"Yo1",
"Yoek1",
"Yoeng2",
"Yeon5",
"Yiu1",
"Yoeng4",
"Yuk2",
"Yat4",
"Yg5",
"Yut4",
"Yan6",
"Yin3",
"Yaa6",
"Yap1",
"Yg2",
"Yoe5",
"Yt4",
"Ya5",
"Yo4",
"Yyu1",
"Yak3",
"Yeon2",
"Yong4",
"Ym1",
"Ye2",
"Yaang5",
"Yoi2",
"Yeng3",
"Yn",
"Yyut4",
"Yau",
"Yaak2",
"Yaan4",
"Yek2",
"Yin1",
"Yi5",
"Yoe2",
"Yei5",
"Yaat6",
"Yak5",
"Yp6",
"Yok6",
"Yei2",
"Yaap1",
"Yyut5",
"Yi4",
"Yim1",
"Yk5",
"Ye4",
"Yok2",
"Yaam6",
"Yat2",
"Yon6",
"Yei3",
"Yyu6",
"Yeot5",
"Yk4",
"Yai6",
"Yd",
"Yg3",
"Yei6",
"Yau2",
"Yok",
"Yau6",
"Yung3",
"Yim5",
"Yut6",
"Yit1",
"Yon3",
"Yat1",
"Yaam2",
"Yyut2",
"Yui6",
"Yt2",
"Yek6",
"Yt",
"Ye6",
"Yang3",
"Ying6",
"Yaau1",
"Yeon3",
"Yng",
"Yh",
"Yang4",
"Ying5",
"Yaap6",
"Yoeng3",
"Yyun4",
"You3",
"Yan5",
"Yat5",
"Yot1",
"Yun1",
"Yi3",
"Yaa1",
"Yaap4",
"You6",
"Yaang2",
"Yaap5",
"Yaa3",
"Yaak6",
"Yeng1",
"Yaak1",
"Yo5",
"Yoi4",
"Yam4",
"Yik1",
"Ye1",
"Yai5",
"Yung1",
"Yp2",
"Yui4",
"Yaak4",
"Yung2",
"Yak4",
"Yaat4",
"Yeoi4",
"Yut2",
"Yin5",
"Yaau4",
"Yap6",
"Yb",
"Yaam4",
"Yw",
"Yut3",
"Yong2",
"Yt6",
"Yaai6",
"Yap5",
"Yik5",
"Yun6",
"Yaam5",
"Yun5",
"Yik3",
"Ya2",
"Yyut6",
"Yon4",
"Yk1",
"Yit4",
"Yak6",
"Yaan2",
"Yuk1",
"Yai2",
"Yik2",
"Yaat2",
"Yo3",
"Ykw",
"Yn5",
"Yaa",
"Ye5",
"Yu4",
"Yei1",
"Yai3",
"Yyun5",
"Yip2",
"Yaau2",
"Yiu5",
"Ym4",
"Yeoi6",
"Yk",
"Ym6",
"Yoe1",
"Yeoi3",
"Yon",
"Yuk4",
"Yaai3",
"Yaa4",
"Yot6",
"Yaang1",
"Yei4",
"Yek1",
"Yo",
"Yp",
"Yo6",
"Yp4",
"Yan3",
"Yoi",
"Yap3",
"Yek3",
"Yim3",
"Yz",
"Yot2",
"Yoi6",
"Yit2",
"Yu5",
"Yaan3",
"Yan1",
"Yon5",
"Yp1",
"Yong5",
"Ygw",
"Yak",
"Yat6",
"Ying4",
"Yu2",
"Yf",
"Ya4",
"Yon1",
"You4",
"Yik6",
"Yui1",
"Yaat1",
"Yeot4",
"Yi2",
"Yaai1",
"Yek5",
"Ym3",
"Yong6",
"You5",
"Yyun1",
"Yn1",
"Yo2",
"Yip6",
"Yui3",
"Yaak5",
"Yyun2",
}
# symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)###直接这么加yue顺序乱了
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
symbols = sorted(set(symbols))
# print(len(symbols))
symbols+=["[","]"]##日文新增上升下降调型
symbols+=sorted(list(ko_symbols))
symbols+=sorted(list(yue_symbols))##新加的yue统一摆在后头#已查过开头加Y后没有重复韩文显然不会重复
symbols += ["[", "]"] ##日文新增上升下降调型
symbols += sorted(list(ko_symbols))
symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复韩文显然不会重复
# print(len(symbols))
if __name__ == "__main__":
print(len(symbols))
'''
"""
粤语:
732-353=379
韩文+粤语:
732-322=410
'''
"""

View File

@@ -510,12 +510,7 @@ class ToneSandhi:
# e.g. 走了, 看着, 去过
elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
finals[-1] = finals[-1][:-1] + "5"
elif (
len(word) > 1
and word[-1] in "们子"
and pos in {"r", "n"}
and word not in self.must_not_neural_tone_words
):
elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
# e.g. 桌上, 地下, 家里
elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
@@ -525,25 +520,18 @@ class ToneSandhi:
finals[-1] = finals[-1][:-1] + "5"
# 个做量词
elif (
ge_idx >= 1
and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
) or word == "个":
finals[ge_idx] = finals[ge_idx][:-1] + "5"
else:
if (
word in self.must_neural_tone_words
or word[-2:] in self.must_neural_tone_words
):
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals[-1] = finals[-1][:-1] + "5"
word_list = self._split_word(word)
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
for i, word in enumerate(word_list):
# conventional neural in Chinese
if (
word in self.must_neural_tone_words
or word[-2:] in self.must_neural_tone_words
):
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
finals = sum(finals_list, [])
return finals
@@ -561,9 +549,7 @@ class ToneSandhi:
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
# "一" in number sequences, e.g. 一零零, 二一零
if word.find("") != -1 and all(
[item.isnumeric() for item in word if item != ""]
):
if word.find("") != -1 and all([item.isnumeric() for item in word if item != ""]):
return finals
# "一" between reduplication words shold be yi5, e.g. 看一看
elif len(word) == 3 and word[1] == "" and word[0] == word[-1]:
@@ -697,13 +683,10 @@ class ToneSandhi:
return new_seg
# the first and the second words are all_tone_three
def _merge_continuous_three_tones(
self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for (word, pos) in seg
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
@@ -715,10 +698,7 @@ class ToneSandhi:
and not merge_last[i - 1]
):
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if (
not self._is_reduplication(seg[i - 1][0])
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
):
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:
@@ -732,13 +712,10 @@ class ToneSandhi:
return len(word) == 2 and word[0] == word[1]
# the last char of first word and the first char of second word is tone_three
def _merge_continuous_three_tones_2(
self, seg: List[Tuple[str, str]]
) -> List[Tuple[str, str]]:
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
sub_finals_list = [
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
for (word, pos) in seg
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
]
assert len(sub_finals_list) == len(seg)
merge_last = [False] * len(seg)
@@ -750,10 +727,7 @@ class ToneSandhi:
and not merge_last[i - 1]
):
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
if (
not self._is_reduplication(seg[i - 1][0])
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
):
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
merge_last[i] = True
else:

File diff suppressed because one or more lines are too long

View File

@@ -21,25 +21,29 @@ from .num import verbalize_digit
def _time_num2str(num_string: str) -> str:
"""A special case for verbalizing number in time."""
result = num2str(num_string.lstrip('0'))
if num_string.startswith('0'):
result = DIGITS['0'] + result
result = num2str(num_string.lstrip("0"))
if num_string.startswith("0"):
result = DIGITS["0"] + result
return result
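A short trace of the leading-zero handling above (using the DIGITS table defined in num.py):

# _time_num2str("05") -> num2str("5") prefixed with DIGITS["0"] -> "零五"
# _time_num2str("30") -> no leading zero, so plain num2str("30") -> "三十"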
# 时刻表达式
RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
RE_TIME = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)
# 时间范围如8:30-12:30
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?'
r'(~|-)'
r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
RE_TIME_RANGE = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
r"(~|-)"
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)
def replace_time(match) -> str:
@@ -62,31 +66,33 @@ def replace_time(match) -> str:
second_2 = match.group(9)
result = f"{num2str(hour)}"
if minute.lstrip('0'):
if minute.lstrip("0"):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute)}"
if second and second.lstrip('0'):
if second and second.lstrip("0"):
result += f"{_time_num2str(second)}"
if is_range:
result += ""
result += f"{num2str(hour_2)}"
if minute_2.lstrip('0'):
if minute_2.lstrip("0"):
if int(minute) == 30:
result += ""
else:
result += f"{_time_num2str(minute_2)}"
if second_2 and second_2.lstrip('0'):
if second_2 and second_2.lstrip("0"):
result += f"{_time_num2str(second_2)}"
return result
RE_DATE = re.compile(r'(\d{4}|\d{2})年'
r'((0?[1-9]|1[0-2])月)?'
r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
RE_DATE = re.compile(
r"(\d{4}|\d{2})年"
r"((0?[1-9]|1[0-2]))?"
r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?"
)
def replace_date(match) -> str:
@@ -110,8 +116,7 @@ def replace_date(match) -> str:
# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
RE_DATE2 = re.compile(
r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])")
def replace_date2(match) -> str:

View File

@@ -18,10 +18,7 @@ from pypinyin.constants import SUPPORT_UCS4
# 全角半角转换
# 英文字符全角 -> 半角映射表 (num: 52)
F2H_ASCII_LETTERS = {
ord(char) + 65248: ord(char)
for char in string.ascii_letters
}
F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters}
# 英文字符半角 -> 全角映射表
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
@@ -37,26 +34,29 @@ F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
# 空格 (num: 1)
F2H_SPACE = {'\u3000': ' '}
H2F_SPACE = {' ': '\u3000'}
F2H_SPACE = {"\u3000": " "}
H2F_SPACE = {" ": "\u3000"}
# 非"有拼音的汉字"的字符串可用于NSW提取
if SUPPORT_UCS4:
RE_NSW = re.compile(r'(?:[^'
r'\u3007' #
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
r'])+')
RE_NSW = re.compile(
r"(?:[^"
r"\u3007" #
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF]
r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F]
r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D]
r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F]
r"])+"
)
else:
RE_NSW = re.compile( # pragma: no cover
r'(?:[^'
r'\u3007' #
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
r'])+')
r"(?:[^"
r"\u3007" #
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
r"])+"
)

View File

@@ -15,23 +15,26 @@
Rules to verbalize numbers into Chinese characters.
https://zh.wikipedia.org/wiki/中文数字#現代中文
"""
import re
from collections import OrderedDict
from typing import List
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
UNITS = OrderedDict({
1: '十',
2: '百',
3: '千',
4: '万',
8: '亿',
})
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
UNITS = OrderedDict(
{
1: "",
2: "",
3: "",
4: "",
8: "亿",
}
)
COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
# 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
def replace_frac(match) -> str:
@@ -52,7 +55,7 @@ def replace_frac(match) -> str:
# 百分数表达式
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
def replace_percentage(match) -> str:
@@ -72,7 +75,7 @@ def replace_percentage(match) -> str:
# 整数表达式
# 带负号的整数 -10
RE_INTEGER = re.compile(r'(-)' r'(\d+)')
RE_INTEGER = re.compile(r"(-)" r"(\d+)")
def replace_negative_num(match) -> str:
@@ -92,7 +95,7 @@ def replace_negative_num(match) -> str:
# 编号-无符号整形
# 00078
RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
def replace_default_num(match):
@@ -110,15 +113,11 @@ def replace_default_num(match):
# RE_ASMD = re.compile(
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
RE_ASMD = re.compile(
r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))')
r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
)
asmd_map = {"+": "", "-": "", "×": "", "÷": "", "=": "等于"}
asmd_map = {
'+': '',
'-': '',
'×': '',
'÷': '',
'=': '等于'
}
def replace_asmd(match) -> str:
"""
@@ -132,24 +131,25 @@ def replace_asmd(match) -> str:
# 次方专项
RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
power_map = {
'⁰': '0',
'¹': '1',
'²': '2',
'³': '3',
'⁴': '4',
'⁵': '5',
'⁶': '6',
'⁷': '7',
'⁸': '8',
'⁹': '9',
'ˣ': 'x',
'ʸ': 'y',
'ⁿ': 'n'
"⁰": "0",
"¹": "1",
"²": "2",
"³": "3",
"⁴": "4",
"⁵": "5",
"⁶": "6",
"⁷": "7",
"⁸": "8",
"⁹": "9",
"ˣ": "x",
"ʸ": "y",
"ⁿ": "n",
}
def replace_power(match) -> str:
"""
Args:
@@ -166,10 +166,10 @@ def replace_power(match) -> str:
# 数字表达式
# 纯小数
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# 正整数 + 量词
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
def replace_positive_quantifier(match) -> str:
@@ -220,7 +220,9 @@ RE_RANGE = re.compile(
[-~] # 匹配范围分隔符
((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
(?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
""", re.VERBOSE)
""",
re.VERBOSE,
)
def replace_range(match) -> str:
@@ -239,7 +241,9 @@ def replace_range(match) -> str:
# ~至表达式
RE_TO_RANGE = re.compile(
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)
def replace_to_range(match) -> str:
"""
@@ -248,71 +252,66 @@ def replace_to_range(match) -> str:
Returns:
str
"""
result = match.group(0).replace('~', '至')
result = match.group(0).replace("~", "至")
return result
def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
stripped = value_string.lstrip('0')
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
stripped = value_string.lstrip("0")
if len(stripped) == 0:
return []
elif len(stripped) == 1:
if use_zero and len(stripped) < len(value_string):
return [DIGITS['0'], DIGITS[stripped]]
return [DIGITS["0"], DIGITS[stripped]]
else:
return [DIGITS[stripped]]
else:
largest_unit = next(
power for power in reversed(UNITS.keys()) if power < len(stripped))
largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped))
first_part = value_string[:-largest_unit]
second_part = value_string[-largest_unit:]
return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
second_part)
return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)
def verbalize_cardinal(value_string: str) -> str:
if not value_string:
return ''
return ""
# 000 -> '零' , 0 -> '零'
value_string = value_string.lstrip('0')
value_string = value_string.lstrip("0")
if len(value_string) == 0:
return DIGITS['0']
return DIGITS["0"]
result_symbols = _get_value(value_string)
# verbalized number starting with '一十*' is abbreviated as `十*`
if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
'1'] and result_symbols[1] == UNITS[1]:
if len(result_symbols) >= 2 and result_symbols[0] == DIGITS["1"] and result_symbols[1] == UNITS[1]:
result_symbols = result_symbols[1:]
return ''.join(result_symbols)
return "".join(result_symbols)
def verbalize_digit(value_string: str, alt_one=False) -> str:
result_symbols = [DIGITS[digit] for digit in value_string]
result = ''.join(result_symbols)
result = "".join(result_symbols)
if alt_one:
result = result.replace("一", "幺")
return result
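alt_one swaps 一 for 幺, the conventional reading when dictating digit strings such as phone numbers. A self-contained sketch of the same behaviour:

DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}

def spell_digits(value_string, alt_one=False):
    # Read each digit independently; optionally use 幺 for 一 (phone-number style).
    result = "".join(DIGITS[d] for d in value_string)
    return result.replace("一", "幺") if alt_one else result

print(spell_digits("110"))                # 一一零
print(spell_digits("110", alt_one=True))  # 幺幺零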
def num2str(value_string: str) -> str:
integer_decimal = value_string.split('.')
integer_decimal = value_string.split(".")
if len(integer_decimal) == 1:
integer = integer_decimal[0]
decimal = ''
decimal = ""
elif len(integer_decimal) == 2:
integer, decimal = integer_decimal
else:
raise ValueError(
f"The value string: '${value_string}' has more than one point in it."
)
raise ValueError(f"The value string: '${value_string}' has more than one point in it.")
result = verbalize_cardinal(integer)
decimal = decimal.rstrip('0')
decimal = decimal.rstrip("0")
if decimal:
# '.22' is verbalized as '零点二二'
# '3.20' is verbalized as '三点二'
result = result if result else ""
result += '' + verbalize_digit(decimal)
result += "" + verbalize_digit(decimal)
return result

View File

@@ -21,10 +21,8 @@ from .num import verbalize_digit
# 移动139、138、137、136、135、134、159、158、157、150、151、152、188、187、182、183、184、178、198
# 联通130、131、132、156、155、186、185、176
# 电信133、153、189、180、181、177
RE_MOBILE_PHONE = re.compile(
r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(
r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
RE_MOBILE_PHONE = re.compile(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
# 全国统一的号码400开头
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
@@ -32,14 +30,12 @@ RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
def phone2str(phone_string: str, mobile=True) -> str:
if mobile:
sp_parts = phone_string.strip('+').split()
result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sp_parts])
sp_parts = phone_string.strip("+").split()
result = "".join([verbalize_digit(part, alt_one=True) for part in sp_parts])
return result
else:
sil_parts = phone_string.split('-')
result = ''.join(
[verbalize_digit(part, alt_one=True) for part in sil_parts])
sil_parts = phone_string.split("-")
result = "".join([verbalize_digit(part, alt_one=True) for part in sil_parts])
return result
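Hypothetical usage of the two branches (outputs described from the logic above, not taken from a run):

# phone2str("+86 139 0000 0000")           -> each space-separated group read digit by digit, with 一 spoken as 幺
# phone2str("010-12345678", mobile=False)  -> the same reading, but groups are split on "-"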

View File

@@ -17,7 +17,7 @@ from .num import num2str
# 温度表达式,温度会影响负号的读法
# -3°C 零下三度
RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)")
measure_dict = {
"cm2": "平方厘米",
"cm²": "平方厘米",
@@ -35,7 +35,7 @@ measure_dict = {
"ml": "毫升",
"m": "",
"mm": "毫米",
"s": ""
"s": "",
}

View File

@@ -56,9 +56,9 @@ from .quantifier import replace_measure
from .quantifier import replace_temperature
class TextNormalizer():
class TextNormalizer:
def __init__(self):
self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')
self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)")
def _split(self, text: str, lang="zh") -> List[str]:
"""Split long text into sentences with sentence-splitting punctuations.
@@ -71,66 +71,64 @@ class TextNormalizer():
if lang == "zh":
text = text.replace(" ", "")
# 过滤掉特殊字符
text = re.sub(r'[——《》【】<>{}()#&@“”^_|\\]', '', text)
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
text = re.sub(r"[——《》【】<>{}()#&@“”^_|\\]", "", text)
text = self.SENTENCE_SPLITOR.sub(r"\1\n", text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
sentences = [sentence.strip() for sentence in re.split(r"\n+", text)]
return sentences
def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace('/', '')
sentence = sentence.replace("/", "")
# sentence = sentence.replace('~', '至')
# sentence = sentence.replace('', '至')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('', '')
sentence = sentence.replace('α', '阿尔法')
sentence = sentence.replace('β', '贝塔')
sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
sentence = sentence.replace('ε', '艾普西龙')
sentence = sentence.replace('ζ', '捷塔')
sentence = sentence.replace('η', '依塔')
sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
sentence = sentence.replace('ι', '艾欧塔')
sentence = sentence.replace('κ', '喀帕')
sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
sentence = sentence.replace('μ', '')
sentence = sentence.replace('ν', '')
sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
sentence = sentence.replace('ο', '欧米克伦')
sentence = sentence.replace('π', '').replace('Π', '')
sentence = sentence.replace('ρ', '')
sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
'σ', '西格玛')
sentence = sentence.replace('τ', '')
sentence = sentence.replace('υ', '宇普西龙')
sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
sentence = sentence.replace('χ', '')
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("", "")
sentence = sentence.replace("α", "阿尔法")
sentence = sentence.replace("β", "贝塔")
sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛")
sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔")
sentence = sentence.replace("ε", "艾普西龙")
sentence = sentence.replace("ζ", "捷塔")
sentence = sentence.replace("η", "依塔")
sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔")
sentence = sentence.replace("ι", "艾欧塔")
sentence = sentence.replace("κ", "喀帕")
sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达")
sentence = sentence.replace("μ", "")
sentence = sentence.replace("ν", "")
sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西")
sentence = sentence.replace("ο", "欧米克伦")
sentence = sentence.replace("π", "").replace("Π", "")
sentence = sentence.replace("ρ", "")
sentence = sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛")
sentence = sentence.replace("τ", "")
sentence = sentence.replace("υ", "宇普西龙")
sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾")
sentence = sentence.replace("χ", "")
sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛")
sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽")
# 兜底数学运算,顺便兼容懒人用语
sentence = sentence.replace('+', '加')
sentence = sentence.replace('-', '减')
sentence = sentence.replace('×', '乘')
sentence = sentence.replace('÷', '除')
sentence = sentence.replace('=', '等于')
sentence = sentence.replace("+", "加")
sentence = sentence.replace("-", "减")
sentence = sentence.replace("×", "乘")
sentence = sentence.replace("÷", "除")
sentence = sentence.replace("=", "等于")
# re filter special characters, have one more character "-" than line 68
sentence = re.sub(r'[-——《》【】<=>{}()#&@“”^_|\\]', '', sentence)
sentence = re.sub(r"[-——《》【】<=>{}()#&@“”^_|\\]", "", sentence)
return sentence
def normalize_sentence(self, sentence: str) -> str:
# basic character conversions
sentence = tranditional_to_simplified(sentence)
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
F2H_DIGITS).translate(F2H_SPACE)
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE)
# number related NSW verbalization
sentence = RE_DATE.sub(replace_date, sentence)
@@ -161,8 +159,7 @@ class TextNormalizer():
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
sentence = RE_NUMBER.sub(replace_number, sentence)
sentence = self._post_replace(sentence)