Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)
* ruff check --fix
* ruff format --line-length 120 --target-version py39
* Change the link for the G2PW Model
* Update the PyTorch version and the Colab notebook
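The same formatting settings could also be pinned in the repository's pyproject.toml so contributors and CI run Ruff with identical options. A minimal sketch, assuming the project adds a [tool.ruff] section (this commit itself only runs the commands above):

# equivalent of: ruff check --fix / ruff format --line-length 120 --target-version py39
[tool.ruff]
line-length = 120
target-version = "py39"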
@@ -1 +1 @@
from .langsegmenter import LangSegmenter
@@ -3,38 +3,44 @@ import re

# jieba静音
import jieba

jieba.setLogLevel(logging.CRITICAL)

# 更改fast_langdetect大模型位置
from pathlib import Path
import fast_langdetect

fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(fast_langdetect.infer.LangDetectConfig(cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"))
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(
    fast_langdetect.infer.LangDetectConfig(
        cache_dir=Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
    )
)

from split_lang import LangSplitter

def full_en(text):
    pattern = r'^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$'
    pattern = r"^[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
    return bool(re.match(pattern, text))


def full_cjk(text):
    # 来自wiki
    cjk_ranges = [
        (0x4E00, 0x9FFF), # CJK Unified Ideographs
        (0x3400, 0x4DB5), # CJK Extension A
        (0x20000, 0x2A6DD), # CJK Extension B
        (0x2A700, 0x2B73F), # CJK Extension C
        (0x2B740, 0x2B81F), # CJK Extension D
        (0x2B820, 0x2CEAF), # CJK Extension E
        (0x2CEB0, 0x2EBEF), # CJK Extension F
        (0x30000, 0x3134A), # CJK Extension G
        (0x31350, 0x323AF), # CJK Extension H
        (0x2EBF0, 0x2EE5D), # CJK Extension H
    ]

    pattern = r'[0-9、-〜。!?.!?… ]+$'
    pattern = r"[0-9、-〜。!?.!?… ]+$"

    cjk_text = ""
    for char in text:
@@ -45,7 +51,7 @@ def full_cjk(text):
    return cjk_text


def split_jako(tag_lang,item):
def split_jako(tag_lang, item):
    if tag_lang == "ja":
        pattern = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
    else:
@@ -53,41 +59,40 @@ def split_jako(tag_lang,item):

    lang_list: list[dict] = []
    tag = 0
    for match in re.finditer(pattern, item['text']):
    for match in re.finditer(pattern, item["text"]):
        if match.start() > tag:
            lang_list.append({'lang':item['lang'],'text':item['text'][tag:match.start()]})
            lang_list.append({"lang": item["lang"], "text": item["text"][tag : match.start()]})

        tag = match.end()
        lang_list.append({'lang':tag_lang,'text':item['text'][match.start():match.end()]})
        lang_list.append({"lang": tag_lang, "text": item["text"][match.start() : match.end()]})

    if tag < len(item['text']):
        lang_list.append({'lang':item['lang'],'text':item['text'][tag:len(item['text'])]})
    if tag < len(item["text"]):
        lang_list.append({"lang": item["lang"], "text": item["text"][tag : len(item["text"])]})

    return lang_list


def merge_lang(lang_list, item):
    if lang_list and item['lang'] == lang_list[-1]['lang']:
        lang_list[-1]['text'] += item['text']
    if lang_list and item["lang"] == lang_list[-1]["lang"]:
        lang_list[-1]["text"] += item["text"]
    else:
        lang_list.append(item)
    return lang_list


class LangSegmenter():
class LangSegmenter:
    # 默认过滤器, 基于gsv目前四种语言
    DEFAULT_LANG_MAP = {
        "zh": "zh",
        "yue": "zh", # 粤语
        "wuu": "zh", # 吴语
        "zh-cn": "zh",
        "zh-tw": "x", # 繁体设置为x
        "ko": "ko",
        "ja": "ja",
        "en": "en",
    }

    def getTexts(text):
        lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
        substr = lang_splitter.split_by_lang(text=text)
@@ -95,18 +100,18 @@ class LangSegmenter():
        lang_list: list[dict] = []

        for _, item in enumerate(substr):
            dict_item = {'lang':item.lang,'text':item.text}
            dict_item = {"lang": item.lang, "text": item.text}

            # 处理短英文被识别为其他语言的问题
            if full_en(dict_item['text']):
                dict_item['lang'] = 'en'
                lang_list = merge_lang(lang_list,dict_item)
            if full_en(dict_item["text"]):
                dict_item["lang"] = "en"
                lang_list = merge_lang(lang_list, dict_item)
                continue

            # 处理非日语夹日文的问题(不包含CJK)
            ja_list: list[dict] = []
            if dict_item['lang'] != 'ja':
                ja_list = split_jako('ja',dict_item)
            if dict_item["lang"] != "ja":
                ja_list = split_jako("ja", dict_item)

            if not ja_list:
                ja_list.append(dict_item)
@@ -115,8 +120,8 @@ class LangSegmenter():
            ko_list: list[dict] = []
            temp_list: list[dict] = []
            for _, ko_item in enumerate(ja_list):
                if ko_item["lang"] != 'ko':
                    ko_list = split_jako('ko',ko_item)
                if ko_item["lang"] != "ko":
                    ko_list = split_jako("ko", ko_item)

                if ko_list:
                    temp_list.extend(ko_list)
@@ -126,28 +131,28 @@ class LangSegmenter():
            # 未存在非日韩文夹日韩文
            if len(temp_list) == 1:
                # 未知语言检查是否为CJK
                if dict_item['lang'] == 'x':
                    cjk_text = full_cjk(dict_item['text'])
                if dict_item["lang"] == "x":
                    cjk_text = full_cjk(dict_item["text"])
                    if cjk_text:
                        dict_item = {'lang':'zh','text':cjk_text}
                        lang_list = merge_lang(lang_list,dict_item)
                        dict_item = {"lang": "zh", "text": cjk_text}
                        lang_list = merge_lang(lang_list, dict_item)
                        continue
                else:
                    lang_list = merge_lang(lang_list,dict_item)
                    lang_list = merge_lang(lang_list, dict_item)
                    continue

            # 存在非日韩文夹日韩文
            for _, temp_item in enumerate(temp_list):
                # 未知语言检查是否为CJK
                if temp_item['lang'] == 'x':
                    cjk_text = full_cjk(dict_item['text'])
                if temp_item["lang"] == "x":
                    cjk_text = full_cjk(dict_item["text"])
                    if cjk_text:
                        dict_item = {'lang':'zh','text':cjk_text}
                        lang_list = merge_lang(lang_list,dict_item)
                        dict_item = {"lang": "zh", "text": cjk_text}
                        lang_list = merge_lang(lang_list, dict_item)
                else:
                    lang_list = merge_lang(lang_list,temp_item)
                    lang_list = merge_lang(lang_list, temp_item)
        return lang_list


if __name__ == "__main__":
    text = "MyGO?,你也喜欢まいご吗?"
@@ -155,4 +160,3 @@ if __name__ == "__main__":

    text = "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。"
    print(LangSegmenter.getTexts(text))

@@ -10,18 +10,19 @@ from text import symbols2 as symbols_v2
|
||||
_symbol_to_id_v1 = {s: i for i, s in enumerate(symbols_v1.symbols)}
|
||||
_symbol_to_id_v2 = {s: i for i, s in enumerate(symbols_v2.symbols)}
|
||||
|
||||
|
||||
def cleaned_text_to_sequence(cleaned_text, version=None):
|
||||
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
Args:
|
||||
text: string to convert to a sequence
|
||||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
'''
|
||||
if version is None:version=os.environ.get('version', 'v2')
|
||||
if version == "v1":
|
||||
phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
|
||||
else:
|
||||
phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
|
||||
|
||||
return phones
|
||||
"""
|
||||
if version is None:
|
||||
version = os.environ.get("version", "v2")
|
||||
if version == "v1":
|
||||
phones = [_symbol_to_id_v1[symbol] for symbol in cleaned_text]
|
||||
else:
|
||||
phones = [_symbol_to_id_v2[symbol] for symbol in cleaned_text]
|
||||
|
||||
return phones
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py
|
||||
|
||||
import sys
|
||||
import re
|
||||
import cn2an
|
||||
import ToJyutping
|
||||
@@ -99,9 +98,7 @@ def replace_punctuation(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
|
||||
|
||||
return replaced_text
|
||||
|
||||
@@ -115,7 +112,9 @@ def text_normalize(text):
|
||||
return dest_text
|
||||
|
||||
|
||||
punctuation_set=set(punctuation)
|
||||
punctuation_set = set(punctuation)
|
||||
|
||||
|
||||
def jyuping_to_initials_finals_tones(jyuping_syllables):
|
||||
initials_finals = []
|
||||
tones = []
|
||||
@@ -160,12 +159,14 @@ def jyuping_to_initials_finals_tones(jyuping_syllables):
|
||||
assert len(initials_finals) == len(tones)
|
||||
|
||||
###魔改为辅音+带音调的元音
|
||||
phones=[]
|
||||
for a,b in zip(initials_finals,tones):
|
||||
if(b not in [-1,0]):###防止粤语和普通话重合开头加Y,如果是标点,不加。
|
||||
todo="%s%s"%(a,b)
|
||||
else:todo=a
|
||||
if(todo not in punctuation_set):todo="Y%s"%todo
|
||||
phones = []
|
||||
for a, b in zip(initials_finals, tones):
|
||||
if b not in [-1, 0]: ###防止粤语和普通话重合开头加Y,如果是标点,不加。
|
||||
todo = "%s%s" % (a, b)
|
||||
else:
|
||||
todo = a
|
||||
if todo not in punctuation_set:
|
||||
todo = "Y%s" % todo
|
||||
phones.append(todo)
|
||||
|
||||
# return initials_finals, tones, word2ph
|
||||
@@ -218,4 +219,4 @@ if __name__ == "__main__":
|
||||
# phones, tones, word2ph = g2p(text)
|
||||
phones, word2ph = g2p(text)
|
||||
# print(phones, tones, word2ph)
|
||||
print(phones, word2ph)
|
||||
print(phones, word2ph)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import pdb
|
||||
import re
|
||||
|
||||
import cn2an
|
||||
@@ -17,7 +16,9 @@ pinyin_to_symbol_map = {
|
||||
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
||||
}
|
||||
|
||||
import jieba_fast, logging
|
||||
import jieba_fast
|
||||
import logging
|
||||
|
||||
jieba_fast.setLogLevel(logging.CRITICAL)
|
||||
import jieba_fast.posseg as psg
|
||||
|
||||
@@ -37,7 +38,7 @@ rep_map = {
|
||||
"/": ",",
|
||||
"—": "-",
|
||||
"~": "…",
|
||||
"~":"…",
|
||||
"~": "…",
|
||||
}
|
||||
|
||||
tone_modifier = ToneSandhi()
|
||||
@@ -49,9 +50,7 @@ def replace_punctuation(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
|
||||
|
||||
return replaced_text
|
||||
|
||||
@@ -62,17 +61,15 @@ def replace_punctuation_with_en(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
|
||||
|
||||
return replaced_text
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
|
||||
punctuations = ''.join(re.escape(p) for p in punctuation)
|
||||
pattern = f'([{punctuations}])([{punctuations}])+'
|
||||
result = re.sub(pattern, r'\1', text)
|
||||
punctuations = "".join(re.escape(p) for p in punctuation)
|
||||
pattern = f"([{punctuations}])([{punctuations}])+"
|
||||
result = re.sub(pattern, r"\1", text)
|
||||
return result
|
||||
|
||||
|
||||
@@ -87,9 +84,7 @@ def _get_initials_finals(word):
|
||||
initials = []
|
||||
finals = []
|
||||
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
|
||||
orig_finals = lazy_pinyin(
|
||||
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
|
||||
)
|
||||
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
|
||||
for c, v in zip(orig_initials, orig_finals):
|
||||
initials.append(c)
|
||||
finals.append(v)
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
import os
|
||||
import pdb
|
||||
import re
|
||||
|
||||
import cn2an
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from pypinyin.contrib.tone_convert import to_normal, to_finals_tone3, to_initials, to_finals
|
||||
from pypinyin.contrib.tone_convert import to_finals_tone3, to_initials
|
||||
|
||||
from text.symbols import punctuation
|
||||
from text.tone_sandhi import ToneSandhi
|
||||
@@ -18,18 +17,26 @@ pinyin_to_symbol_map = {
|
||||
for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
|
||||
}
|
||||
|
||||
import jieba_fast, logging
|
||||
import jieba_fast
|
||||
import logging
|
||||
|
||||
jieba_fast.setLogLevel(logging.CRITICAL)
|
||||
import jieba_fast.posseg as psg
|
||||
|
||||
# is_g2pw_str = os.environ.get("is_g2pw", "True")##默认开启
|
||||
# is_g2pw = False#True if is_g2pw_str.lower() == 'true' else False
|
||||
is_g2pw = True#True if is_g2pw_str.lower() == 'true' else False
|
||||
is_g2pw = True # True if is_g2pw_str.lower() == 'true' else False
|
||||
if is_g2pw:
|
||||
# print("当前使用g2pw进行拼音推理")
|
||||
from text.g2pw import G2PWPinyin, correct_pronunciation
|
||||
|
||||
parent_directory = os.path.dirname(current_file_path)
|
||||
g2pw = G2PWPinyin(model_dir="GPT_SoVITS/text/G2PWModel",model_source=os.environ.get("bert_path","GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),v_to_u=False, neutral_tone_with_five=True)
|
||||
g2pw = G2PWPinyin(
|
||||
model_dir="GPT_SoVITS/text/G2PWModel",
|
||||
model_source=os.environ.get("bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"),
|
||||
v_to_u=False,
|
||||
neutral_tone_with_five=True,
|
||||
)
|
||||
|
||||
rep_map = {
|
||||
":": ",",
|
||||
@@ -46,7 +53,7 @@ rep_map = {
|
||||
"/": ",",
|
||||
"—": "-",
|
||||
"~": "…",
|
||||
"~":"…",
|
||||
"~": "…",
|
||||
}
|
||||
|
||||
tone_modifier = ToneSandhi()
|
||||
@@ -58,9 +65,7 @@ def replace_punctuation(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
replaced_text = re.sub(r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text)
|
||||
|
||||
return replaced_text
|
||||
|
||||
@@ -77,9 +82,7 @@ def _get_initials_finals(word):
|
||||
finals = []
|
||||
|
||||
orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
|
||||
orig_finals = lazy_pinyin(
|
||||
word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
|
||||
)
|
||||
orig_finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
|
||||
|
||||
for c, v in zip(orig_initials, orig_finals):
|
||||
initials.append(c)
|
||||
@@ -87,31 +90,66 @@ def _get_initials_finals(word):
|
||||
return initials, finals
|
||||
|
||||
|
||||
must_erhua = {
|
||||
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
|
||||
}
|
||||
must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"}
|
||||
not_erhua = {
|
||||
"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
|
||||
"拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
|
||||
"流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
|
||||
"孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
|
||||
"狗儿", "少儿"
|
||||
"虐儿",
|
||||
"为儿",
|
||||
"护儿",
|
||||
"瞒儿",
|
||||
"救儿",
|
||||
"替儿",
|
||||
"有儿",
|
||||
"一儿",
|
||||
"我儿",
|
||||
"俺儿",
|
||||
"妻儿",
|
||||
"拐儿",
|
||||
"聋儿",
|
||||
"乞儿",
|
||||
"患儿",
|
||||
"幼儿",
|
||||
"孤儿",
|
||||
"婴儿",
|
||||
"婴幼儿",
|
||||
"连体儿",
|
||||
"脑瘫儿",
|
||||
"流浪儿",
|
||||
"体弱儿",
|
||||
"混血儿",
|
||||
"蜜雪儿",
|
||||
"舫儿",
|
||||
"祖儿",
|
||||
"美儿",
|
||||
"应采儿",
|
||||
"可儿",
|
||||
"侄儿",
|
||||
"孙儿",
|
||||
"侄孙儿",
|
||||
"女儿",
|
||||
"男儿",
|
||||
"红孩儿",
|
||||
"花儿",
|
||||
"虫儿",
|
||||
"马儿",
|
||||
"鸟儿",
|
||||
"猪儿",
|
||||
"猫儿",
|
||||
"狗儿",
|
||||
"少儿",
|
||||
}
|
||||
def _merge_erhua(initials: list[str],
|
||||
finals: list[str],
|
||||
word: str,
|
||||
pos: str) -> list[list[str]]:
|
||||
|
||||
|
||||
def _merge_erhua(initials: list[str], finals: list[str], word: str, pos: str) -> list[list[str]]:
|
||||
"""
|
||||
Do erhub.
|
||||
"""
|
||||
# fix er1
|
||||
for i, phn in enumerate(finals):
|
||||
if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
|
||||
finals[i] = 'er2'
|
||||
if i == len(finals) - 1 and word[i] == "儿" and phn == "er1":
|
||||
finals[i] = "er2"
|
||||
|
||||
# 发音
|
||||
if word not in must_erhua and (word in not_erhua or
|
||||
pos in {"a", "j", "nr"}):
|
||||
if word not in must_erhua and (word in not_erhua or pos in {"a", "j", "nr"}):
|
||||
return initials, finals
|
||||
|
||||
# "……" 等情况直接返回
|
||||
@@ -124,9 +162,13 @@ def _merge_erhua(initials: list[str],
|
||||
new_initials = []
|
||||
new_finals = []
|
||||
for i, phn in enumerate(finals):
|
||||
if i == len(finals) - 1 and word[i] == "儿" and phn in {
|
||||
"er2", "er5"
|
||||
} and word[-2:] not in not_erhua and new_finals:
|
||||
if (
|
||||
i == len(finals) - 1
|
||||
and word[i] == "儿"
|
||||
and phn in {"er2", "er5"}
|
||||
and word[-2:] not in not_erhua
|
||||
and new_finals
|
||||
):
|
||||
phn = "er" + new_finals[-1][-1]
|
||||
|
||||
new_initials.append(initials[i])
|
||||
@@ -160,7 +202,7 @@ def _g2p(segments):
|
||||
# assert len(sub_initials) == len(sub_finals) == len(word)
|
||||
initials = sum(initials, [])
|
||||
finals = sum(finals, [])
|
||||
print("pypinyin结果",initials,finals)
|
||||
print("pypinyin结果", initials, finals)
|
||||
else:
|
||||
# g2pw采用整句推理
|
||||
pinyins = g2pw.lazy_pinyin(seg, neutral_tone_with_five=True, style=Style.TONE3)
|
||||
@@ -171,19 +213,19 @@ def _g2p(segments):
|
||||
sub_finals = []
|
||||
now_word_length = pre_word_length + len(word)
|
||||
|
||||
if pos == 'eng':
|
||||
if pos == "eng":
|
||||
pre_word_length = now_word_length
|
||||
continue
|
||||
|
||||
word_pinyins = pinyins[pre_word_length:now_word_length]
|
||||
|
||||
# 多音字消歧
|
||||
word_pinyins = correct_pronunciation(word,word_pinyins)
|
||||
word_pinyins = correct_pronunciation(word, word_pinyins)
|
||||
|
||||
for pinyin in word_pinyins:
|
||||
if pinyin[0].isalpha():
|
||||
sub_initials.append(to_initials(pinyin))
|
||||
sub_finals.append(to_finals_tone3(pinyin,neutral_tone_with_five=True))
|
||||
sub_finals.append(to_finals_tone3(pinyin, neutral_tone_with_five=True))
|
||||
else:
|
||||
sub_initials.append(pinyin)
|
||||
sub_finals.append(pinyin)
|
||||
@@ -259,18 +301,18 @@ def replace_punctuation_with_en(text):
|
||||
|
||||
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
||||
|
||||
replaced_text = re.sub(
|
||||
r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text
|
||||
)
|
||||
replaced_text = re.sub(r"[^\u4e00-\u9fa5A-Za-z" + "".join(punctuation) + r"]+", "", replaced_text)
|
||||
|
||||
return replaced_text
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
|
||||
punctuations = ''.join(re.escape(p) for p in punctuation)
|
||||
pattern = f'([{punctuations}])([{punctuations}])+'
|
||||
result = re.sub(pattern, r'\1', text)
|
||||
punctuations = "".join(re.escape(p) for p in punctuation)
|
||||
pattern = f"([{punctuations}])([{punctuations}])+"
|
||||
result = re.sub(pattern, r"\1", text)
|
||||
return result
|
||||
|
||||
|
||||
def text_normalize(text):
|
||||
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
|
||||
tx = TextNormalizer()
|
||||
@@ -283,6 +325,7 @@ def text_normalize(text):
|
||||
dest_text = replace_consecutive_punctuation(dest_text)
|
||||
return dest_text
|
||||
|
||||
|
||||
# 不排除英文的文本格式化
|
||||
def mix_text_normalize(text):
|
||||
# https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
|
||||
|
||||
@@ -19,55 +19,57 @@ special = [
|
||||
|
||||
|
||||
def clean_text(text, language, version=None):
|
||||
if version is None:version=os.environ.get('version', 'v2')
|
||||
if version is None:
|
||||
version = os.environ.get("version", "v2")
|
||||
if version == "v1":
|
||||
symbols = symbols_v1.symbols
|
||||
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
|
||||
else:
|
||||
symbols = symbols_v2.symbols
|
||||
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"}
|
||||
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
|
||||
|
||||
if(language not in language_module_map):
|
||||
language="en"
|
||||
text=" "
|
||||
if language not in language_module_map:
|
||||
language = "en"
|
||||
text = " "
|
||||
for special_s, special_l, target_symbol in special:
|
||||
if special_s in text and language == special_l:
|
||||
return clean_special(text, language, special_s, target_symbol, version)
|
||||
language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]])
|
||||
if hasattr(language_module,"text_normalize"):
|
||||
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
|
||||
if hasattr(language_module, "text_normalize"):
|
||||
norm_text = language_module.text_normalize(text)
|
||||
else:
|
||||
norm_text=text
|
||||
if language == "zh" or language=="yue":##########
|
||||
norm_text = text
|
||||
if language == "zh" or language == "yue": ##########
|
||||
phones, word2ph = language_module.g2p(norm_text)
|
||||
assert len(phones) == sum(word2ph)
|
||||
assert len(norm_text) == len(word2ph)
|
||||
elif language == "en":
|
||||
phones = language_module.g2p(norm_text)
|
||||
if len(phones) < 4:
|
||||
phones = [','] + phones
|
||||
phones = [","] + phones
|
||||
word2ph = None
|
||||
else:
|
||||
phones = language_module.g2p(norm_text)
|
||||
word2ph = None
|
||||
phones = ['UNK' if ph not in symbols else ph for ph in phones]
|
||||
phones = ["UNK" if ph not in symbols else ph for ph in phones]
|
||||
return phones, word2ph, norm_text
|
||||
|
||||
|
||||
def clean_special(text, language, special_s, target_symbol, version=None):
|
||||
if version is None:version=os.environ.get('version', 'v2')
|
||||
if version is None:
|
||||
version = os.environ.get("version", "v2")
|
||||
if version == "v1":
|
||||
symbols = symbols_v1.symbols
|
||||
language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
|
||||
else:
|
||||
symbols = symbols_v2.symbols
|
||||
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean","yue":"cantonese"}
|
||||
language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}
|
||||
|
||||
"""
|
||||
特殊静音段sp符号处理
|
||||
"""
|
||||
text = text.replace(special_s, ",")
|
||||
language_module = __import__("text."+language_module_map[language],fromlist=[language_module_map[language]])
|
||||
language_module = __import__("text." + language_module_map[language], fromlist=[language_module_map[language]])
|
||||
norm_text = language_module.text_normalize(text)
|
||||
phones = language_module.g2p(norm_text)
|
||||
new_ph = []
|
||||
@@ -81,8 +83,9 @@ def clean_special(text, language, special_s, target_symbol, version=None):
|
||||
|
||||
|
||||
def text_to_sequence(text, language, version=None):
|
||||
version = os.environ.get('version',version)
|
||||
if version is None:version='v2'
|
||||
version = os.environ.get("version", version)
|
||||
if version is None:
|
||||
version = "v2"
|
||||
phones = clean_text(text)
|
||||
return cleaned_text_to_sequence(phones, version)
|
||||
|
||||
|
||||
@@ -9,17 +9,17 @@ import unicodedata
|
||||
# 后缀计量单位替换表
|
||||
measurement_map = {
|
||||
"m": ["meter", "meters"],
|
||||
'km': ["kilometer", "kilometers"],
|
||||
"km": ["kilometer", "kilometers"],
|
||||
"km/h": ["kilometer per hour", "kilometers per hour"],
|
||||
"ft": ["feet", "feet"],
|
||||
"L": ["liter", "liters"],
|
||||
"tbsp": ["tablespoon", "tablespoons"],
|
||||
'tsp': ["teaspoon", "teaspoons"],
|
||||
"tsp": ["teaspoon", "teaspoons"],
|
||||
"h": ["hour", "hours"],
|
||||
"min": ["minute", "minutes"],
|
||||
"s": ["second", "seconds"],
|
||||
"°C": ["degree celsius", "degrees celsius"],
|
||||
"°F": ["degree fahrenheit", "degrees fahrenheit"]
|
||||
"°F": ["degree fahrenheit", "degrees fahrenheit"],
|
||||
}
|
||||
|
||||
|
||||
@@ -27,41 +27,42 @@ measurement_map = {
|
||||
_inflect = inflect.engine()
|
||||
|
||||
# 转化数字序数词
|
||||
_ordinal_number_re = re.compile(r'\b([0-9]+)\. ')
|
||||
_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")
|
||||
|
||||
# 我听说好像对于数字正则识别其实用 \d 会好一点
|
||||
|
||||
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
|
||||
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
||||
|
||||
# 时间识别
|
||||
_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b')
|
||||
_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")
|
||||
|
||||
# 后缀计量单位识别
|
||||
_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b')
|
||||
_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b")
|
||||
|
||||
# 前后 £ 识别 ( 写了识别两边某一边的,但是不知道为什么失败了┭┮﹏┭┮ )
|
||||
_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)')
|
||||
_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£')
|
||||
_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
|
||||
_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")
|
||||
|
||||
# 前后 $ 识别
|
||||
_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)')
|
||||
_dollars_re_end = re.compile(r'([(0-9\.\,]*[0-9]+)\$')
|
||||
_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
||||
_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$")
|
||||
|
||||
# 小数的识别
|
||||
_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)')
|
||||
_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")
|
||||
|
||||
# 分数识别 (形式 "3/4" )
|
||||
_fraction_re = re.compile(r'([0-9]+/[0-9]+)')
|
||||
_fraction_re = re.compile(r"([0-9]+/[0-9]+)")
|
||||
|
||||
# 序数词识别
|
||||
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
|
||||
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
||||
|
||||
# 数字处理
|
||||
_number_re = re.compile(r'[0-9]+')
|
||||
_number_re = re.compile(r"[0-9]+")
|
||||
|
||||
|
||||
def _convert_ordinal(m):
|
||||
"""
|
||||
标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
|
||||
标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
|
||||
Examples:
|
||||
input: "1. "
|
||||
output: "1st"
|
||||
@@ -70,24 +71,26 @@ def _convert_ordinal(m):
|
||||
ordinal = _inflect.ordinal(m.group(1))
|
||||
return ordinal + ", "
|
||||
|
||||
|
||||
def _remove_commas(m):
|
||||
return m.group(1).replace(',', '')
|
||||
return m.group(1).replace(",", "")
|
||||
|
||||
|
||||
def _expand_time(m):
|
||||
"""
|
||||
将 24 小时制的时间转换为 12 小时制的时间表示方式。
|
||||
|
||||
|
||||
Examples:
|
||||
input: "13:00 / 4:00 / 13:30"
|
||||
output: "one o'clock p.m. / four o'clock am. / one thirty p.m."
|
||||
"""
|
||||
hours, minutes = map(int, m.group(1, 2))
|
||||
period = 'a.m.' if hours < 12 else 'p.m.'
|
||||
period = "a.m." if hours < 12 else "p.m."
|
||||
if hours > 12:
|
||||
hours -= 12
|
||||
|
||||
hour_word = _inflect.number_to_words(hours)
|
||||
minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ''
|
||||
minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ""
|
||||
|
||||
if minutes == 0:
|
||||
return f"{hour_word} o'clock {period}"
|
||||
@@ -103,7 +106,7 @@ def _expand_measurement(m):
|
||||
sign = m.group(3)
|
||||
ptr = 1
|
||||
# 想不到怎么方便的取数字,又懒得改正则,诶,1.2 反正也是复数读法,干脆直接去掉 "."
|
||||
num = int(m.group(1).replace(sign, '').replace(".",''))
|
||||
num = int(m.group(1).replace(sign, "").replace(".", ""))
|
||||
decimal_part = m.group(2)
|
||||
# 上面判断的漏洞,比如 0.1 的情况,在这里排除了
|
||||
if decimal_part == None and num == 1:
|
||||
@@ -116,23 +119,24 @@ def _expand_pounds(m):
|
||||
没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起
|
||||
"""
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
parts = match.split(".")
|
||||
if len(parts) > 2:
|
||||
return match + ' pounds' # Unexpected format
|
||||
return match + " pounds" # Unexpected format
|
||||
pounds = int(parts[0]) if parts[0] else 0
|
||||
pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
|
||||
pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
|
||||
if pounds and pence:
|
||||
pound_unit = 'pound' if pounds == 1 else 'pounds'
|
||||
penny_unit = 'penny' if pence == 1 else 'pence'
|
||||
return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit)
|
||||
pound_unit = "pound" if pounds == 1 else "pounds"
|
||||
penny_unit = "penny" if pence == 1 else "pence"
|
||||
return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit)
|
||||
elif pounds:
|
||||
pound_unit = 'pound' if pounds == 1 else 'pounds'
|
||||
return '%s %s' % (pounds, pound_unit)
|
||||
pound_unit = "pound" if pounds == 1 else "pounds"
|
||||
return "%s %s" % (pounds, pound_unit)
|
||||
elif pence:
|
||||
penny_unit = 'penny' if pence == 1 else 'pence'
|
||||
return '%s %s' % (pence, penny_unit)
|
||||
penny_unit = "penny" if pence == 1 else "pence"
|
||||
return "%s %s" % (pence, penny_unit)
|
||||
else:
|
||||
return 'zero pounds'
|
||||
return "zero pounds"
|
||||
|
||||
|
||||
def _expand_dollars(m):
|
||||
"""
|
||||
@@ -142,23 +146,24 @@ def _expand_dollars(m):
|
||||
output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
|
||||
"""
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
parts = match.split(".")
|
||||
if len(parts) > 2:
|
||||
return match + ' dollars' # Unexpected format
|
||||
return match + " dollars" # Unexpected format
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
|
||||
cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit)
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit)
|
||||
elif dollars:
|
||||
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
|
||||
return '%s %s' % (dollars, dollar_unit)
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
return "%s %s" % (dollars, dollar_unit)
|
||||
elif cents:
|
||||
cent_unit = 'cent' if cents == 1 else 'cents'
|
||||
return '%s %s' % (cents, cent_unit)
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return "%s %s" % (cents, cent_unit)
|
||||
else:
|
||||
return 'zero dollars'
|
||||
return "zero dollars"
|
||||
|
||||
|
||||
# 小数的处理
|
||||
def _expand_decimal_number(m):
|
||||
@@ -168,11 +173,11 @@ def _expand_decimal_number(m):
|
||||
output: "thirteen point two three four"
|
||||
"""
|
||||
match = m.group(1)
|
||||
parts = match.split('.')
|
||||
parts = match.split(".")
|
||||
words = []
|
||||
# 遍历字符串中的每个字符
|
||||
for char in parts[1]:
|
||||
if char == '.':
|
||||
if char == ".":
|
||||
words.append("point")
|
||||
else:
|
||||
words.append(char)
|
||||
@@ -186,7 +191,7 @@ def _expend_fraction(m):
|
||||
规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法.
|
||||
规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves.
|
||||
Examples:
|
||||
|
||||
|
||||
| Written | Said |
|
||||
|:---:|:---:|
|
||||
| 1/3 | one third |
|
||||
@@ -196,39 +201,41 @@ def _expend_fraction(m):
|
||||
| 3/2 | three halves |
|
||||
"""
|
||||
match = m.group(0)
|
||||
numerator, denominator = map(int, match.split('/'))
|
||||
numerator, denominator = map(int, match.split("/"))
|
||||
|
||||
numerator_part = _inflect.number_to_words(numerator)
|
||||
if denominator == 2:
|
||||
if numerator == 1:
|
||||
denominator_part = 'half'
|
||||
denominator_part = "half"
|
||||
else:
|
||||
denominator_part = 'halves'
|
||||
denominator_part = "halves"
|
||||
elif denominator == 1:
|
||||
return f'{numerator_part}'
|
||||
return f"{numerator_part}"
|
||||
else:
|
||||
denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
|
||||
if numerator > 1:
|
||||
denominator_part += 's'
|
||||
denominator_part += "s"
|
||||
|
||||
return f"{numerator_part} {denominator_part}"
|
||||
|
||||
return f'{numerator_part} {denominator_part}'
|
||||
|
||||
def _expand_ordinal(m):
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
|
||||
|
||||
def _expand_number(m):
|
||||
num = int(m.group(0))
|
||||
if num > 1000 and num < 3000:
|
||||
if num == 2000:
|
||||
return 'two thousand'
|
||||
return "two thousand"
|
||||
elif num > 2000 and num < 2010:
|
||||
return 'two thousand ' + _inflect.number_to_words(num % 100)
|
||||
return "two thousand " + _inflect.number_to_words(num % 100)
|
||||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + ' hundred'
|
||||
return _inflect.number_to_words(num // 100) + " hundred"
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
|
||||
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword='')
|
||||
return _inflect.number_to_words(num, andword="")
|
||||
|
||||
|
||||
def normalize(text):
|
||||
@@ -238,7 +245,7 @@ def normalize(text):
|
||||
"""
|
||||
|
||||
text = re.sub(_ordinal_number_re, _convert_ordinal, text)
|
||||
text = re.sub(r'(?<!\d)-|-(?!\d)', ' minus ', text)
|
||||
text = re.sub(r"(?<!\d)-|-(?!\d)", " minus ", text)
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_time_re, _expand_time, text)
|
||||
text = re.sub(_measurement_re, _expand_measurement, text)
|
||||
@@ -251,19 +258,20 @@ def normalize(text):
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
||||
|
||||
text = ''.join(char for char in unicodedata.normalize('NFD', text)
|
||||
if unicodedata.category(char) != 'Mn') # Strip accents
|
||||
text = "".join(
|
||||
char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn"
|
||||
) # Strip accents
|
||||
|
||||
text = re.sub("%", " percent", text)
|
||||
text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
|
||||
text = re.sub(r"(?i)i\.e\.", "that is", text)
|
||||
text = re.sub(r"(?i)e\.g\.", "for example", text)
|
||||
# 增加纯大写单词拆分
|
||||
text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
|
||||
text = re.sub(r"(?<!^)(?<![\s])([A-Z])", r" \1", text)
|
||||
return text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
# 我觉得其实可以把切分结果展示出来(只读,或者修改不影响传给TTS的实际text)
|
||||
# 然后让用户确认后再输入给 TTS,可以让用户检查自己有没有不标准的输入
|
||||
print(normalize("1. test ordinal number 1st"))
|
||||
@@ -272,4 +280,4 @@ if __name__ == '__main__':
|
||||
print(normalize("1st, 22nd"))
|
||||
print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
|
||||
print(normalize("a test of time 4:00, 13:00, 13:30"))
|
||||
print(normalize("a test of temperature 4°F, 23°C, -19°C"))
|
||||
print(normalize("a test of temperature 4°F, 23°C, -19°C"))
|
||||
|
||||
@@ -8,10 +8,10 @@ from text.symbols import punctuation
|
||||
|
||||
from text.symbols2 import symbols
|
||||
|
||||
import unicodedata
|
||||
from builtins import str as unicode
|
||||
from text.en_normalization.expend import normalize
|
||||
from nltk.tokenize import TweetTokenizer
|
||||
|
||||
word_tokenize = TweetTokenizer().tokenize
|
||||
from nltk import pos_tag
|
||||
|
||||
@@ -122,9 +122,9 @@ def replace_phs(phs):
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
|
||||
punctuations = ''.join(re.escape(p) for p in punctuation)
|
||||
pattern = f'([{punctuations}\s])([{punctuations}])+'
|
||||
result = re.sub(pattern, r'\1', text)
|
||||
punctuations = "".join(re.escape(p) for p in punctuation)
|
||||
pattern = f"([{punctuations}\s])([{punctuations}])+"
|
||||
result = re.sub(pattern, r"\1", text)
|
||||
return result
|
||||
|
||||
|
||||
@@ -183,6 +183,7 @@ def read_dict_new():
|
||||
|
||||
return g2p_dict
|
||||
|
||||
|
||||
def hot_reload_hot(g2p_dict):
|
||||
with open(CMU_DICT_HOT_PATH) as f:
|
||||
line = f.readline()
|
||||
@@ -259,9 +260,12 @@ class en_G2p(G2p):
|
||||
del self.cmu[word.lower()]
|
||||
|
||||
# 修正多音字
|
||||
self.homograph2features["read"] = (['R', 'IY1', 'D'], ['R', 'EH1', 'D'], 'VBP')
|
||||
self.homograph2features["complex"] = (['K', 'AH0', 'M', 'P', 'L', 'EH1', 'K', 'S'], ['K', 'AA1', 'M', 'P', 'L', 'EH0', 'K', 'S'], 'JJ')
|
||||
|
||||
self.homograph2features["read"] = (["R", "IY1", "D"], ["R", "EH1", "D"], "VBP")
|
||||
self.homograph2features["complex"] = (
|
||||
["K", "AH0", "M", "P", "L", "EH1", "K", "S"],
|
||||
["K", "AA1", "M", "P", "L", "EH0", "K", "S"],
|
||||
"JJ",
|
||||
)
|
||||
|
||||
def __call__(self, text):
|
||||
# tokenization
|
||||
@@ -280,7 +284,7 @@ class en_G2p(G2p):
|
||||
elif len(word) == 1:
|
||||
# 单读 A 发音修正, 这里需要原格式 o_word 判断大写
|
||||
if o_word == "A":
|
||||
pron = ['EY1']
|
||||
pron = ["EY1"]
|
||||
else:
|
||||
pron = self.cmu[word][0]
|
||||
# g2p_en 原版多音字处理
|
||||
@@ -289,7 +293,7 @@ class en_G2p(G2p):
|
||||
if pos.startswith(pos1):
|
||||
pron = pron1
|
||||
# pos1比pos长仅出现在read
|
||||
elif len(pos) < len(pos1) and pos == pos1[:len(pos)]:
|
||||
elif len(pos) < len(pos1) and pos == pos1[: len(pos)]:
|
||||
pron = pron1
|
||||
else:
|
||||
pron = pron2
|
||||
@@ -302,7 +306,6 @@ class en_G2p(G2p):
|
||||
|
||||
return prons[:-1]
|
||||
|
||||
|
||||
def qryword(self, o_word):
|
||||
word = o_word.lower()
|
||||
|
||||
@@ -320,7 +323,7 @@ class en_G2p(G2p):
|
||||
for w in word:
|
||||
# 单读 A 发音修正, 此处不存在大写的情况
|
||||
if w == "a":
|
||||
phones.extend(['EY1'])
|
||||
phones.extend(["EY1"])
|
||||
elif not w.isalpha():
|
||||
phones.extend([w])
|
||||
else:
|
||||
@@ -331,23 +334,23 @@ class en_G2p(G2p):
|
||||
if re.match(r"^([a-z]+)('s)$", word):
|
||||
phones = self.qryword(word[:-2])[:]
|
||||
# P T K F TH HH 无声辅音结尾 's 发 ['S']
|
||||
if phones[-1] in ['P', 'T', 'K', 'F', 'TH', 'HH']:
|
||||
phones.extend(['S'])
|
||||
if phones[-1] in ["P", "T", "K", "F", "TH", "HH"]:
|
||||
phones.extend(["S"])
|
||||
# S Z SH ZH CH JH 擦声结尾 's 发 ['IH1', 'Z'] 或 ['AH0', 'Z']
|
||||
elif phones[-1] in ['S', 'Z', 'SH', 'ZH', 'CH', 'JH']:
|
||||
phones.extend(['AH0', 'Z'])
|
||||
elif phones[-1] in ["S", "Z", "SH", "ZH", "CH", "JH"]:
|
||||
phones.extend(["AH0", "Z"])
|
||||
# B D G DH V M N NG L R W Y 有声辅音结尾 's 发 ['Z']
|
||||
# AH0 AH1 AH2 EY0 EY1 EY2 AE0 AE1 AE2 EH0 EH1 EH2 OW0 OW1 OW2 UH0 UH1 UH2 IY0 IY1 IY2 AA0 AA1 AA2 AO0 AO1 AO2
|
||||
# ER ER0 ER1 ER2 UW0 UW1 UW2 AY0 AY1 AY2 AW0 AW1 AW2 OY0 OY1 OY2 IH IH0 IH1 IH2 元音结尾 's 发 ['Z']
|
||||
else:
|
||||
phones.extend(['Z'])
|
||||
phones.extend(["Z"])
|
||||
return phones
|
||||
|
||||
# 尝试进行分词,应对复合词
|
||||
comps = wordsegment.segment(word.lower())
|
||||
|
||||
# 无法分词的送回去预测
|
||||
if len(comps)==1:
|
||||
if len(comps) == 1:
|
||||
return self.predict(word)
|
||||
|
||||
# 可以分词的递归处理
|
||||
|
||||
@@ -1 +1 @@
|
||||
from text.g2pw.g2pw import *
|
||||
from text.g2pw.g2pw import *
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
Credits
|
||||
This code is modified from https://github.com/GitYCC/g2pW
|
||||
"""
|
||||
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
@@ -23,21 +24,24 @@ import numpy as np
|
||||
|
||||
from .utils import tokenize_and_map
|
||||
|
||||
ANCHOR_CHAR = '▁'
|
||||
ANCHOR_CHAR = "▁"
|
||||
|
||||
|
||||
def prepare_onnx_input(tokenizer,
|
||||
labels: List[str],
|
||||
char2phonemes: Dict[str, List[int]],
|
||||
chars: List[str],
|
||||
texts: List[str],
|
||||
query_ids: List[int],
|
||||
use_mask: bool=False,
|
||||
window_size: int=None,
|
||||
max_len: int=512) -> Dict[str, np.array]:
|
||||
def prepare_onnx_input(
|
||||
tokenizer,
|
||||
labels: List[str],
|
||||
char2phonemes: Dict[str, List[int]],
|
||||
chars: List[str],
|
||||
texts: List[str],
|
||||
query_ids: List[int],
|
||||
use_mask: bool = False,
|
||||
window_size: int = None,
|
||||
max_len: int = 512,
|
||||
) -> Dict[str, np.array]:
|
||||
if window_size is not None:
|
||||
truncated_texts, truncated_query_ids = _truncate_texts(
|
||||
window_size=window_size, texts=texts, query_ids=query_ids)
|
||||
window_size=window_size, texts=texts, query_ids=query_ids
|
||||
)
|
||||
input_ids = []
|
||||
token_type_ids = []
|
||||
attention_masks = []
|
||||
@@ -50,33 +54,27 @@ def prepare_onnx_input(tokenizer,
|
||||
query_id = (truncated_query_ids if window_size else query_ids)[idx]
|
||||
|
||||
try:
|
||||
tokens, text2token, token2text = tokenize_and_map(
|
||||
tokenizer=tokenizer, text=text)
|
||||
tokens, text2token, token2text = tokenize_and_map(tokenizer=tokenizer, text=text)
|
||||
except Exception:
|
||||
print(f'warning: text "{text}" is invalid')
|
||||
return {}
|
||||
|
||||
text, query_id, tokens, text2token, token2text = _truncate(
|
||||
max_len=max_len,
|
||||
text=text,
|
||||
query_id=query_id,
|
||||
tokens=tokens,
|
||||
text2token=text2token,
|
||||
token2text=token2text)
|
||||
max_len=max_len, text=text, query_id=query_id, tokens=tokens, text2token=text2token, token2text=token2text
|
||||
)
|
||||
|
||||
processed_tokens = ['[CLS]'] + tokens + ['[SEP]']
|
||||
processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]
|
||||
|
||||
input_id = list(
|
||||
np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
|
||||
token_type_id = list(np.zeros((len(processed_tokens), ), dtype=int))
|
||||
attention_mask = list(np.ones((len(processed_tokens), ), dtype=int))
|
||||
input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
|
||||
token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
|
||||
attention_mask = list(np.ones((len(processed_tokens),), dtype=int))
|
||||
|
||||
query_char = text[query_id]
|
||||
phoneme_mask = [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] \
|
||||
if use_mask else [1] * len(labels)
|
||||
phoneme_mask = (
|
||||
[1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))] if use_mask else [1] * len(labels)
|
||||
)
|
||||
char_id = chars.index(query_char)
|
||||
position_id = text2token[
|
||||
query_id] + 1 # [CLS] token locate at first place
|
||||
position_id = text2token[query_id] + 1 # [CLS] token locate at first place
|
||||
|
||||
input_ids.append(input_id)
|
||||
token_type_ids.append(token_type_id)
|
||||
@@ -86,18 +84,17 @@ def prepare_onnx_input(tokenizer,
|
||||
position_ids.append(position_id)
|
||||
|
||||
outputs = {
|
||||
'input_ids': np.array(input_ids).astype(np.int64),
|
||||
'token_type_ids': np.array(token_type_ids).astype(np.int64),
|
||||
'attention_masks': np.array(attention_masks).astype(np.int64),
|
||||
'phoneme_masks': np.array(phoneme_masks).astype(np.float32),
|
||||
'char_ids': np.array(char_ids).astype(np.int64),
|
||||
'position_ids': np.array(position_ids).astype(np.int64),
|
||||
"input_ids": np.array(input_ids).astype(np.int64),
|
||||
"token_type_ids": np.array(token_type_ids).astype(np.int64),
|
||||
"attention_masks": np.array(attention_masks).astype(np.int64),
|
||||
"phoneme_masks": np.array(phoneme_masks).astype(np.float32),
|
||||
"char_ids": np.array(char_ids).astype(np.int64),
|
||||
"position_ids": np.array(position_ids).astype(np.int64),
|
||||
}
|
||||
return outputs
|
||||
|
||||
|
||||
def _truncate_texts(window_size: int, texts: List[str],
|
||||
query_ids: List[int]) -> Tuple[List[str], List[int]]:
|
||||
def _truncate_texts(window_size: int, texts: List[str], query_ids: List[int]) -> Tuple[List[str], List[int]]:
|
||||
truncated_texts = []
|
||||
truncated_query_ids = []
|
||||
for text, query_id in zip(texts, query_ids):
|
||||
@@ -111,12 +108,9 @@ def _truncate_texts(window_size: int, texts: List[str],
|
||||
return truncated_texts, truncated_query_ids
|
||||
|
||||
|
||||
def _truncate(max_len: int,
|
||||
text: str,
|
||||
query_id: int,
|
||||
tokens: List[str],
|
||||
text2token: List[int],
|
||||
token2text: List[Tuple[int]]):
|
||||
def _truncate(
|
||||
max_len: int, text: str, query_id: int, tokens: List[str], text2token: List[int], token2text: List[Tuple[int]]
|
||||
):
|
||||
truncate_len = max_len - 2
|
||||
if len(tokens) <= truncate_len:
|
||||
return (text, query_id, tokens, text2token, token2text)
|
||||
@@ -137,14 +131,16 @@ def _truncate(max_len: int,
|
||||
start = token2text[token_start][0]
|
||||
end = token2text[token_end - 1][1]
|
||||
|
||||
return (text[start:end], query_id - start, tokens[token_start:token_end], [
|
||||
i - token_start if i is not None else None
|
||||
for i in text2token[start:end]
|
||||
], [(s - start, e - start) for s, e in token2text[token_start:token_end]])
|
||||
return (
|
||||
text[start:end],
|
||||
query_id - start,
|
||||
tokens[token_start:token_end],
|
||||
[i - token_start if i is not None else None for i in text2token[start:end]],
|
||||
[(s - start, e - start) for s, e in token2text[token_start:token_end]],
|
||||
)
|
||||
|
||||
|
||||
def get_phoneme_labels(polyphonic_chars: List[List[str]]
|
||||
) -> Tuple[List[str], Dict[str, List[int]]]:
|
||||
def get_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
|
||||
labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
|
||||
char2phonemes = {}
|
||||
for char, phoneme in polyphonic_chars:
|
||||
@@ -154,13 +150,11 @@ def get_phoneme_labels(polyphonic_chars: List[List[str]]
|
||||
return labels, char2phonemes
|
||||
|
||||
|
||||
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]
|
||||
) -> Tuple[List[str], Dict[str, List[int]]]:
|
||||
labels = sorted(
|
||||
list(set([f'{char} {phoneme}' for char, phoneme in polyphonic_chars])))
|
||||
def get_char_phoneme_labels(polyphonic_chars: List[List[str]]) -> Tuple[List[str], Dict[str, List[int]]]:
|
||||
labels = sorted(list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars])))
|
||||
char2phonemes = {}
|
||||
for char, phoneme in polyphonic_chars:
|
||||
if char not in char2phonemes:
|
||||
char2phonemes[char] = []
|
||||
char2phonemes[char].append(labels.index(f'{char} {phoneme}'))
|
||||
char2phonemes[char].append(labels.index(f"{char} {phoneme}"))
|
||||
return labels, char2phonemes
|
||||
|
||||
@@ -17,17 +17,25 @@ PP_FIX_DICT_PATH = os.path.join(current_file_path, "polyphonic-fix.rep")
|
||||
|
||||
|
||||
class G2PWPinyin(Pinyin):
|
||||
def __init__(self, model_dir='G2PWModel/', model_source=None,
|
||||
enable_non_tradional_chinese=True,
|
||||
v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
|
||||
def __init__(
|
||||
self,
|
||||
model_dir="G2PWModel/",
|
||||
model_source=None,
|
||||
enable_non_tradional_chinese=True,
|
||||
v_to_u=False,
|
||||
neutral_tone_with_five=False,
|
||||
tone_sandhi=False,
|
||||
**kwargs,
|
||||
):
|
||||
self._g2pw = G2PWOnnxConverter(
|
||||
model_dir=model_dir,
|
||||
style='pinyin',
|
||||
style="pinyin",
|
||||
model_source=model_source,
|
||||
enable_non_tradional_chinese=enable_non_tradional_chinese,
|
||||
)
|
||||
self._converter = Converter(
|
||||
self._g2pw, v_to_u=v_to_u,
|
||||
self._g2pw,
|
||||
v_to_u=v_to_u,
|
||||
neutral_tone_with_five=neutral_tone_with_five,
|
||||
tone_sandhi=tone_sandhi,
|
||||
)
|
||||
@@ -37,31 +45,25 @@ class G2PWPinyin(Pinyin):
|
||||
|
||||
|
||||
class Converter(UltimateConverter):
|
||||
def __init__(self, g2pw_instance, v_to_u=False,
|
||||
neutral_tone_with_five=False,
|
||||
tone_sandhi=False, **kwargs):
|
||||
def __init__(self, g2pw_instance, v_to_u=False, neutral_tone_with_five=False, tone_sandhi=False, **kwargs):
|
||||
super(Converter, self).__init__(
|
||||
v_to_u=v_to_u,
|
||||
neutral_tone_with_five=neutral_tone_with_five,
|
||||
tone_sandhi=tone_sandhi, **kwargs)
|
||||
v_to_u=v_to_u, neutral_tone_with_five=neutral_tone_with_five, tone_sandhi=tone_sandhi, **kwargs
|
||||
)
|
||||
|
||||
self._g2pw = g2pw_instance
|
||||
|
||||
def convert(self, words, style, heteronym, errors, strict, **kwargs):
|
||||
pys = []
|
||||
if RE_HANS.match(words):
|
||||
pys = self._to_pinyin(words, style=style, heteronym=heteronym,
|
||||
errors=errors, strict=strict)
|
||||
pys = self._to_pinyin(words, style=style, heteronym=heteronym, errors=errors, strict=strict)
|
||||
post_data = self.post_pinyin(words, heteronym, pys)
|
||||
if post_data is not None:
|
||||
pys = post_data
|
||||
|
||||
pys = self.convert_styles(
|
||||
pys, words, style, heteronym, errors, strict)
|
||||
pys = self.convert_styles(pys, words, style, heteronym, errors, strict)
|
||||
|
||||
else:
|
||||
py = self.handle_nopinyin(words, style=style, errors=errors,
|
||||
heteronym=heteronym, strict=strict)
|
||||
py = self.handle_nopinyin(words, style=style, errors=errors, heteronym=heteronym, strict=strict)
|
||||
if py:
|
||||
pys.extend(py)
|
||||
|
||||
@@ -73,13 +75,11 @@ class Converter(UltimateConverter):
|
||||
g2pw_pinyin = self._g2pw(han)
|
||||
|
||||
if not g2pw_pinyin: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑
|
||||
return super(Converter, self).convert(
|
||||
han, Style.TONE, heteronym, errors, strict, **kwargs)
|
||||
return super(Converter, self).convert(han, Style.TONE, heteronym, errors, strict, **kwargs)
|
||||
|
||||
for i, item in enumerate(g2pw_pinyin[0]):
|
||||
if item is None: # g2pw 不支持的汉字改为使用 pypinyin 原有逻辑
|
||||
py = super(Converter, self).convert(
|
||||
han[i], Style.TONE, heteronym, errors, strict, **kwargs)
|
||||
py = super(Converter, self).convert(han[i], Style.TONE, heteronym, errors, strict, **kwargs)
|
||||
pinyins.extend(py)
|
||||
else:
|
||||
pinyins.append([to_tone(item)])
|
||||
@@ -104,7 +104,7 @@ def _remove_dup_and_empty(lst_list):
|
||||
if lst:
|
||||
new_lst_list.append(lst)
|
||||
else:
|
||||
new_lst_list.append([''])
|
||||
new_lst_list.append([""])
|
||||
|
||||
return new_lst_list
|
||||
|
||||
@@ -127,17 +127,17 @@ def get_dict():
|
||||
|
||||
def read_dict():
|
||||
polyphonic_dict = {}
|
||||
with open(PP_DICT_PATH,encoding="utf-8") as f:
|
||||
with open(PP_DICT_PATH, encoding="utf-8") as f:
|
||||
line = f.readline()
|
||||
while line:
|
||||
key, value_str = line.split(':')
|
||||
key, value_str = line.split(":")
|
||||
value = eval(value_str.strip())
|
||||
polyphonic_dict[key.strip()] = value
|
||||
line = f.readline()
|
||||
with open(PP_FIX_DICT_PATH,encoding="utf-8") as f:
|
||||
with open(PP_FIX_DICT_PATH, encoding="utf-8") as f:
|
||||
line = f.readline()
|
||||
while line:
|
||||
key, value_str = line.split(':')
|
||||
key, value_str = line.split(":")
|
||||
value = eval(value_str.strip())
|
||||
polyphonic_dict[key.strip()] = value
|
||||
line = f.readline()
|
||||
|
||||
@@ -2,44 +2,43 @@
|
||||
# This code is modified from https://github.com/GitYCC/g2pW
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
import json
|
||||
import os
|
||||
import zipfile,requests
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
import zipfile
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import onnxruntime
|
||||
import requests
|
||||
|
||||
onnxruntime.set_default_logger_severity(3)
|
||||
from opencc import OpenCC
|
||||
from pypinyin import Style, pinyin
|
||||
from transformers import AutoTokenizer
|
||||
from pypinyin import pinyin
|
||||
from pypinyin import Style
|
||||
|
||||
from .dataset import get_char_phoneme_labels
|
||||
from .dataset import get_phoneme_labels
|
||||
from .dataset import prepare_onnx_input
|
||||
from .utils import load_config
|
||||
from ..zh_normalization.char_convert import tranditional_to_simplified
|
||||
from .dataset import get_char_phoneme_labels, get_phoneme_labels, prepare_onnx_input
|
||||
from .utils import load_config
|
||||
|
||||
model_version = '1.1'
|
||||
model_version = "1.1"
|
||||
|
||||
|
||||
def predict(session, onnx_input: Dict[str, Any],
|
||||
labels: List[str]) -> Tuple[List[str], List[float]]:
|
||||
def predict(session, onnx_input: Dict[str, Any], labels: List[str]) -> Tuple[List[str], List[float]]:
|
||||
all_preds = []
|
||||
all_confidences = []
|
||||
probs = session.run([], {
|
||||
"input_ids": onnx_input['input_ids'],
|
||||
"token_type_ids": onnx_input['token_type_ids'],
|
||||
"attention_mask": onnx_input['attention_masks'],
|
||||
"phoneme_mask": onnx_input['phoneme_masks'],
|
||||
"char_ids": onnx_input['char_ids'],
|
||||
"position_ids": onnx_input['position_ids']
|
||||
})[0]
|
||||
probs = session.run(
|
||||
[],
|
||||
{
|
||||
"input_ids": onnx_input["input_ids"],
|
||||
"token_type_ids": onnx_input["token_type_ids"],
|
||||
"attention_mask": onnx_input["attention_masks"],
|
||||
"phoneme_mask": onnx_input["phoneme_masks"],
|
||||
"char_ids": onnx_input["char_ids"],
|
||||
"position_ids": onnx_input["position_ids"],
|
||||
},
|
||||
)[0]
|
||||
|
||||
preds = np.argmax(probs, axis=1).tolist()
|
||||
max_probs = []
|
||||
@@ -51,17 +50,17 @@ def predict(session, onnx_input: Dict[str, Any],
|
||||
return all_preds, all_confidences
|
||||
|
||||
|
||||
def download_and_decompress(model_dir: str='G2PWModel/'):
|
||||
def download_and_decompress(model_dir: str = "G2PWModel/"):
|
||||
if not os.path.exists(model_dir):
|
||||
parent_directory = os.path.dirname(model_dir)
|
||||
zip_dir = os.path.join(parent_directory,"G2PWModel_1.1.zip")
|
||||
extract_dir = os.path.join(parent_directory,"G2PWModel_1.1")
|
||||
extract_dir_new = os.path.join(parent_directory,"G2PWModel")
|
||||
zip_dir = os.path.join(parent_directory, "G2PWModel_1.1.zip")
|
||||
extract_dir = os.path.join(parent_directory, "G2PWModel_1.1")
|
||||
extract_dir_new = os.path.join(parent_directory, "G2PWModel")
|
||||
print("Downloading g2pw model...")
|
||||
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip"#"https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
modelscope_url = "https://www.modelscope.cn/models/kamiorinn/g2pw/resolve/master/G2PWModel_1.1.zip" # "https://paddlespeech.cdn.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip"
|
||||
with requests.get(modelscope_url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
with open(zip_dir, 'wb') as f:
|
||||
with open(zip_dir, "wb") as f:
|
||||
for chunk in r.iter_content(chunk_size=8192):
|
||||
if chunk:
|
||||
f.write(chunk)
|
||||
@@ -69,17 +68,20 @@ def download_and_decompress(model_dir: str='G2PWModel/'):
|
||||
print("Extracting g2pw model...")
|
||||
with zipfile.ZipFile(zip_dir, "r") as zip_ref:
|
||||
zip_ref.extractall(parent_directory)
|
||||
|
||||
|
||||
os.rename(extract_dir, extract_dir_new)
|
||||
|
||||
return model_dir
|
||||
|
||||
|
||||
class G2PWOnnxConverter:
|
||||
def __init__(self,
|
||||
model_dir: str='G2PWModel/',
|
||||
style: str='bopomofo',
|
||||
model_source: str=None,
|
||||
enable_non_tradional_chinese: bool=False):
|
||||
def __init__(
|
||||
self,
|
||||
model_dir: str = "G2PWModel/",
|
||||
style: str = "bopomofo",
|
||||
model_source: str = None,
|
||||
enable_non_tradional_chinese: bool = False,
|
||||
):
|
||||
uncompress_path = download_and_decompress(model_dir)
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
@@ -87,41 +89,59 @@ class G2PWOnnxConverter:
|
||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
||||
sess_options.intra_op_num_threads = 2
|
||||
try:
|
||||
self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
||||
self.session_g2pW = onnxruntime.InferenceSession(
|
||||
os.path.join(uncompress_path, "g2pW.onnx"),
|
||||
sess_options=sess_options,
|
||||
providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
|
||||
)
|
||||
except:
|
||||
self.session_g2pW = onnxruntime.InferenceSession(os.path.join(uncompress_path, 'g2pW.onnx'),sess_options=sess_options, providers=['CPUExecutionProvider'])
|
||||
self.config = load_config(
|
||||
config_path=os.path.join(uncompress_path, 'config.py'),
|
||||
use_default=True)
|
||||
self.session_g2pW = onnxruntime.InferenceSession(
|
||||
os.path.join(uncompress_path, "g2pW.onnx"),
|
||||
sess_options=sess_options,
|
||||
providers=["CPUExecutionProvider"],
|
||||
)
|
||||
self.config = load_config(config_path=os.path.join(uncompress_path, "config.py"), use_default=True)
|
||||
|
||||
self.model_source = model_source if model_source else self.config.model_source
|
||||
self.enable_opencc = enable_non_tradional_chinese
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(self.model_source)
|
||||
|
||||
polyphonic_chars_path = os.path.join(uncompress_path,
|
||||
'POLYPHONIC_CHARS.txt')
|
||||
monophonic_chars_path = os.path.join(uncompress_path,
|
||||
'MONOPHONIC_CHARS.txt')
|
||||
polyphonic_chars_path = os.path.join(uncompress_path, "POLYPHONIC_CHARS.txt")
|
||||
monophonic_chars_path = os.path.join(uncompress_path, "MONOPHONIC_CHARS.txt")
|
||||
self.polyphonic_chars = [
|
||||
line.split('\t')
|
||||
for line in open(polyphonic_chars_path, encoding='utf-8').read()
|
||||
.strip().split('\n')
|
||||
line.split("\t") for line in open(polyphonic_chars_path, encoding="utf-8").read().strip().split("\n")
|
||||
]
|
||||
self.non_polyphonic = {
|
||||
'一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗',
|
||||
'肖', '瘙', '誒', '泊', '听', '噢'
|
||||
"一",
|
||||
"不",
|
||||
"和",
|
||||
"咋",
|
||||
"嗲",
|
||||
"剖",
|
||||
"差",
|
||||
"攢",
|
||||
"倒",
|
||||
"難",
|
||||
"奔",
|
||||
"勁",
|
||||
"拗",
|
||||
"肖",
|
||||
"瘙",
|
||||
"誒",
|
||||
"泊",
|
||||
"听",
|
||||
"噢",
|
||||
}
|
||||
self.non_monophonic = {'似', '攢'}
|
||||
self.non_monophonic = {"似", "攢"}
|
||||
self.monophonic_chars = [
|
||||
line.split('\t')
|
||||
for line in open(monophonic_chars_path, encoding='utf-8').read()
|
||||
.strip().split('\n')
|
||||
line.split("\t") for line in open(monophonic_chars_path, encoding="utf-8").read().strip().split("\n")
|
||||
]
|
||||
self.labels, self.char2phonemes = get_char_phoneme_labels(
|
||||
polyphonic_chars=self.polyphonic_chars
|
||||
) if self.config.use_char_phoneme else get_phoneme_labels(
|
||||
polyphonic_chars=self.polyphonic_chars)
|
||||
self.labels, self.char2phonemes = (
|
||||
get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
|
||||
if self.config.use_char_phoneme
|
||||
else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
|
||||
)
|
||||
|
||||
self.chars = sorted(list(self.char2phonemes.keys()))
|
||||
|
||||
@@ -130,41 +150,29 @@ class G2PWOnnxConverter:
|
||||
if char in self.polyphonic_chars_new:
|
||||
self.polyphonic_chars_new.remove(char)
|
||||
|
||||
self.monophonic_chars_dict = {
|
||||
char: phoneme
|
||||
for char, phoneme in self.monophonic_chars
|
||||
}
|
||||
self.monophonic_chars_dict = {char: phoneme for char, phoneme in self.monophonic_chars}
|
||||
for char in self.non_monophonic:
|
||||
if char in self.monophonic_chars_dict:
|
||||
self.monophonic_chars_dict.pop(char)
|
||||
|
||||
self.pos_tags = [
|
||||
'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI'
|
||||
]
|
||||
self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]
|
||||
|
||||
with open(
|
||||
os.path.join(uncompress_path,
|
||||
'bopomofo_to_pinyin_wo_tune_dict.json'),
|
||||
'r',
|
||||
encoding='utf-8') as fr:
|
||||
with open(os.path.join(uncompress_path, "bopomofo_to_pinyin_wo_tune_dict.json"), "r", encoding="utf-8") as fr:
|
||||
self.bopomofo_convert_dict = json.load(fr)
|
||||
self.style_convert_func = {
|
||||
'bopomofo': lambda x: x,
|
||||
'pinyin': self._convert_bopomofo_to_pinyin,
|
||||
"bopomofo": lambda x: x,
|
||||
"pinyin": self._convert_bopomofo_to_pinyin,
|
||||
}[style]
|
||||
|
||||
with open(
|
||||
os.path.join(uncompress_path, 'char_bopomofo_dict.json'),
|
||||
'r',
|
||||
encoding='utf-8') as fr:
|
||||
with open(os.path.join(uncompress_path, "char_bopomofo_dict.json"), "r", encoding="utf-8") as fr:
|
||||
self.char_bopomofo_dict = json.load(fr)
|
||||
|
||||
if self.enable_opencc:
|
||||
self.cc = OpenCC('s2tw')
|
||||
self.cc = OpenCC("s2tw")
|
||||
|
||||
def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
|
||||
tone = bopomofo[-1]
|
||||
assert tone in '12345'
|
||||
assert tone in "12345"
|
||||
component = self.bopomofo_convert_dict.get(bopomofo[:-1])
|
||||
if component:
|
||||
return component + tone
|
||||
@@ -184,8 +192,7 @@ class G2PWOnnxConverter:
|
||||
translated_sentences.append(translated_sent)
|
||||
sentences = translated_sentences
|
||||
|
||||
texts, query_ids, sent_ids, partial_results = self._prepare_data(
|
||||
sentences=sentences)
|
||||
texts, query_ids, sent_ids, partial_results = self._prepare_data(sentences=sentences)
|
||||
if len(texts) == 0:
|
||||
# sentences no polyphonic words
|
||||
return partial_results
|
||||
@@ -198,14 +205,12 @@ class G2PWOnnxConverter:
|
||||
texts=texts,
|
||||
query_ids=query_ids,
|
||||
use_mask=self.config.use_mask,
|
||||
window_size=None)
|
||||
window_size=None,
|
||||
)
|
||||
|
||||
preds, confidences = predict(
|
||||
session=self.session_g2pW,
|
||||
onnx_input=onnx_input,
|
||||
labels=self.labels)
|
||||
preds, confidences = predict(session=self.session_g2pW, onnx_input=onnx_input, labels=self.labels)
|
||||
if self.config.use_char_phoneme:
|
||||
preds = [pred.split(' ')[1] for pred in preds]
|
||||
preds = [pred.split(" ")[1] for pred in preds]
|
||||
|
||||
results = partial_results
|
||||
for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
|
||||
@@ -213,15 +218,12 @@ class G2PWOnnxConverter:
|
||||
|
||||
return results
|
||||
|
||||
def _prepare_data(
|
||||
self, sentences: List[str]
|
||||
) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
|
||||
def _prepare_data(self, sentences: List[str]) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
|
||||
texts, query_ids, sent_ids, partial_results = [], [], [], []
|
||||
for sent_id, sent in enumerate(sentences):
|
||||
# pypinyin works well for Simplified Chinese than Traditional Chinese
|
||||
sent_s = tranditional_to_simplified(sent)
|
||||
pypinyin_result = pinyin(
|
||||
sent_s, neutral_tone_with_five=True, style=Style.TONE3)
|
||||
pypinyin_result = pinyin(sent_s, neutral_tone_with_five=True, style=Style.TONE3)
|
||||
partial_result = [None] * len(sent)
|
||||
for i, char in enumerate(sent):
|
||||
if char in self.polyphonic_chars_new:
|
||||
@@ -229,8 +231,7 @@ class G2PWOnnxConverter:
|
||||
query_ids.append(i)
|
||||
sent_ids.append(sent_id)
|
||||
elif char in self.monophonic_chars_dict:
|
||||
partial_result[i] = self.style_convert_func(
|
||||
self.monophonic_chars_dict[char])
|
||||
partial_result[i] = self.style_convert_func(self.monophonic_chars_dict[char])
|
||||
elif char in self.char_bopomofo_dict:
|
||||
partial_result[i] = pypinyin_result[i][0]
|
||||
# partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
Credits
|
||||
This code is modified from https://github.com/GitYCC/g2pW
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
@@ -24,14 +25,14 @@ def wordize_and_map(text: str):
|
||||
index_map_from_text_to_word = []
|
||||
index_map_from_word_to_text = []
|
||||
while len(text) > 0:
|
||||
match_space = re.match(r'^ +', text)
|
||||
match_space = re.match(r"^ +", text)
|
||||
if match_space:
|
||||
space_str = match_space.group(0)
|
||||
index_map_from_text_to_word += [None] * len(space_str)
|
||||
text = text[len(space_str):]
|
||||
text = text[len(space_str) :]
|
||||
continue
|
||||
|
||||
match_en = re.match(r'^[a-zA-Z0-9]+', text)
|
||||
match_en = re.match(r"^[a-zA-Z0-9]+", text)
|
||||
if match_en:
|
||||
en_word = match_en.group(0)
|
||||
|
||||
@@ -42,7 +43,7 @@ def wordize_and_map(text: str):
|
||||
index_map_from_text_to_word += [len(words)] * len(en_word)
|
||||
|
||||
words.append(en_word)
|
||||
text = text[len(en_word):]
|
||||
text = text[len(en_word) :]
|
||||
else:
|
||||
word_start_pos = len(index_map_from_text_to_word)
|
||||
word_end_pos = word_start_pos + 1
|
||||
@@ -63,15 +64,14 @@ def tokenize_and_map(tokenizer, text: str):
|
||||
for word, (word_start, word_end) in zip(words, word2text):
|
||||
word_tokens = tokenizer.tokenize(word)
|
||||
|
||||
if len(word_tokens) == 0 or word_tokens == ['[UNK]']:
|
||||
if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
|
||||
index_map_from_token_to_text.append((word_start, word_end))
|
||||
tokens.append('[UNK]')
|
||||
tokens.append("[UNK]")
|
||||
else:
|
||||
current_word_start = word_start
|
||||
for word_token in word_tokens:
|
||||
word_token_len = len(re.sub(r'^##', '', word_token))
|
||||
index_map_from_token_to_text.append(
|
||||
(current_word_start, current_word_start + word_token_len))
|
||||
word_token_len = len(re.sub(r"^##", "", word_token))
|
||||
index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len))
|
||||
current_word_start = current_word_start + word_token_len
|
||||
tokens.append(word_token)
|
||||
|
||||
@@ -85,53 +85,51 @@ def tokenize_and_map(tokenizer, text: str):
|
||||
|
||||
def _load_config(config_path: os.PathLike):
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location('__init__', config_path)
|
||||
|
||||
spec = importlib.util.spec_from_file_location("__init__", config_path)
|
||||
config = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(config)
|
||||
return config
|
||||
|
||||
|
||||
default_config_dict = {
|
||||
'manual_seed': 1313,
|
||||
'model_source': 'bert-base-chinese',
|
||||
'window_size': 32,
|
||||
'num_workers': 2,
|
||||
'use_mask': True,
|
||||
'use_char_phoneme': False,
|
||||
'use_conditional': True,
|
||||
'param_conditional': {
|
||||
'affect_location': 'softmax',
|
||||
'bias': True,
|
||||
'char-linear': True,
|
||||
'pos-linear': False,
|
||||
'char+pos-second': True,
|
||||
'char+pos-second_lowrank': False,
|
||||
'lowrank_size': 0,
|
||||
'char+pos-second_fm': False,
|
||||
'fm_size': 0,
|
||||
'fix_mode': None,
|
||||
'count_json': 'train.count.json'
|
||||
"manual_seed": 1313,
|
||||
"model_source": "bert-base-chinese",
|
||||
"window_size": 32,
|
||||
"num_workers": 2,
|
||||
"use_mask": True,
|
||||
"use_char_phoneme": False,
|
||||
"use_conditional": True,
|
||||
"param_conditional": {
|
||||
"affect_location": "softmax",
|
||||
"bias": True,
|
||||
"char-linear": True,
|
||||
"pos-linear": False,
|
||||
"char+pos-second": True,
|
||||
"char+pos-second_lowrank": False,
|
||||
"lowrank_size": 0,
|
||||
"char+pos-second_fm": False,
|
||||
"fm_size": 0,
|
||||
"fix_mode": None,
|
||||
"count_json": "train.count.json",
|
||||
},
|
||||
'lr': 5e-5,
|
||||
'val_interval': 200,
|
||||
'num_iter': 10000,
|
||||
'use_focal': False,
|
||||
'param_focal': {
|
||||
'alpha': 0.0,
|
||||
'gamma': 0.7
|
||||
"lr": 5e-5,
|
||||
"val_interval": 200,
|
||||
"num_iter": 10000,
|
||||
"use_focal": False,
|
||||
"param_focal": {"alpha": 0.0, "gamma": 0.7},
|
||||
"use_pos": True,
|
||||
"param_pos ": {
|
||||
"weight": 0.1,
|
||||
"pos_joint_training": True,
|
||||
"train_pos_path": "train.pos",
|
||||
"valid_pos_path": "dev.pos",
|
||||
"test_pos_path": "test.pos",
|
||||
},
|
||||
'use_pos': True,
|
||||
'param_pos ': {
|
||||
'weight': 0.1,
|
||||
'pos_joint_training': True,
|
||||
'train_pos_path': 'train.pos',
|
||||
'valid_pos_path': 'dev.pos',
|
||||
'test_pos_path': 'test.pos'
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def load_config(config_path: os.PathLike, use_default: bool=False):
|
||||
def load_config(config_path: os.PathLike, use_default: bool = False):
|
||||
config = _load_config(config_path)
|
||||
if use_default:
|
||||
for attr, val in default_config_dict.items():
|
||||
|
||||
@@ -2,43 +2,51 @@
|
||||
import re
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
try:
|
||||
import pyopenjtalk
|
||||
|
||||
current_file_path = os.path.dirname(__file__)
|
||||
|
||||
# 防止win下无法读取模型
|
||||
if os.name == 'nt':
|
||||
if os.name == "nt":
|
||||
python_dir = os.getcwd()
|
||||
OPEN_JTALK_DICT_DIR = pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8")
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', OPEN_JTALK_DICT_DIR)):
|
||||
if (OPEN_JTALK_DICT_DIR[:len(python_dir)].upper() == python_dir.upper()):
|
||||
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR,python_dir))
|
||||
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", OPEN_JTALK_DICT_DIR)):
|
||||
if OPEN_JTALK_DICT_DIR[: len(python_dir)].upper() == python_dir.upper():
|
||||
OPEN_JTALK_DICT_DIR = os.path.join(os.path.relpath(OPEN_JTALK_DICT_DIR, python_dir))
|
||||
else:
|
||||
import shutil
|
||||
if not os.path.exists('TEMP'):
|
||||
os.mkdir('TEMP')
|
||||
|
||||
if not os.path.exists("TEMP"):
|
||||
os.mkdir("TEMP")
|
||||
if not os.path.exists(os.path.join("TEMP", "ja")):
|
||||
os.mkdir(os.path.join("TEMP", "ja"))
|
||||
if os.path.exists(os.path.join("TEMP", "ja", "open_jtalk_dic")):
|
||||
shutil.rmtree(os.path.join("TEMP", "ja", "open_jtalk_dic"))
|
||||
shutil.copytree(pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"), os.path.join("TEMP", "ja", "open_jtalk_dic"), )
|
||||
shutil.copytree(
|
||||
pyopenjtalk.OPEN_JTALK_DICT_DIR.decode("utf-8"),
|
||||
os.path.join("TEMP", "ja", "open_jtalk_dic"),
|
||||
)
|
||||
OPEN_JTALK_DICT_DIR = os.path.join("TEMP", "ja", "open_jtalk_dic")
|
||||
pyopenjtalk.OPEN_JTALK_DICT_DIR = OPEN_JTALK_DICT_DIR.encode("utf-8")
|
||||
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', current_file_path)):
|
||||
if (current_file_path[:len(python_dir)].upper() == python_dir.upper()):
|
||||
current_file_path = os.path.join(os.path.relpath(current_file_path,python_dir))
|
||||
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", current_file_path)):
|
||||
if current_file_path[: len(python_dir)].upper() == python_dir.upper():
|
||||
current_file_path = os.path.join(os.path.relpath(current_file_path, python_dir))
|
||||
else:
|
||||
if not os.path.exists('TEMP'):
|
||||
os.mkdir('TEMP')
|
||||
if not os.path.exists("TEMP"):
|
||||
os.mkdir("TEMP")
|
||||
if not os.path.exists(os.path.join("TEMP", "ja")):
|
||||
os.mkdir(os.path.join("TEMP", "ja"))
|
||||
if not os.path.exists(os.path.join("TEMP", "ja", "ja_userdic")):
|
||||
os.mkdir(os.path.join("TEMP", "ja", "ja_userdic"))
|
||||
shutil.copyfile(os.path.join(current_file_path, "ja_userdic", "userdict.csv"),os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"))
|
||||
shutil.copyfile(
|
||||
os.path.join(current_file_path, "ja_userdic", "userdict.csv"),
|
||||
os.path.join("TEMP", "ja", "ja_userdic", "userdict.csv"),
|
||||
)
|
||||
current_file_path = os.path.join("TEMP", "ja")
|
||||
|
||||
|
||||
def get_hash(fp: str) -> str:
|
||||
hash_md5 = hashlib.md5()
|
||||
with open(fp, "rb") as f:
|
||||
@@ -51,21 +59,26 @@ try:
|
||||
USERDIC_HASH_PATH = os.path.join(current_file_path, "ja_userdic", "userdict.md5")
|
||||
# 如果没有用户词典,就生成一个;如果有,就检查md5,如果不一样,就重新生成
|
||||
if os.path.exists(USERDIC_CSV_PATH):
|
||||
if not os.path.exists(USERDIC_BIN_PATH) or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r",encoding='utf-8').read():
|
||||
if (
|
||||
not os.path.exists(USERDIC_BIN_PATH)
|
||||
or get_hash(USERDIC_CSV_PATH) != open(USERDIC_HASH_PATH, "r", encoding="utf-8").read()
|
||||
):
|
||||
pyopenjtalk.mecab_dict_index(USERDIC_CSV_PATH, USERDIC_BIN_PATH)
|
||||
with open(USERDIC_HASH_PATH, "w", encoding='utf-8') as f:
|
||||
with open(USERDIC_HASH_PATH, "w", encoding="utf-8") as f:
|
||||
f.write(get_hash(USERDIC_CSV_PATH))
|
||||
|
||||
if os.path.exists(USERDIC_BIN_PATH):
|
||||
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
|
||||
except Exception as e:
|
||||
pyopenjtalk.update_global_jtalk_with_user_dict(USERDIC_BIN_PATH)
|
||||
except Exception:
|
||||
# print(e)
|
||||
import pyopenjtalk
|
||||
|
||||
# failed to load user dictionary, ignore.
|
||||
pass
|
||||
|
||||
|
||||
from text.symbols import punctuation
|
||||
|
||||
# Regular expression matching Japanese without punctuation marks:
|
||||
_japanese_characters = re.compile(
|
||||
r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
|
||||
@@ -123,9 +136,9 @@ def post_replace_ph(ph):
|
||||
|
||||
|
||||
def replace_consecutive_punctuation(text):
|
||||
punctuations = ''.join(re.escape(p) for p in punctuation)
|
||||
pattern = f'([{punctuations}])([{punctuations}])+'
|
||||
result = re.sub(pattern, r'\1', text)
|
||||
punctuations = "".join(re.escape(p) for p in punctuation)
|
||||
pattern = f"([{punctuations}])([{punctuations}])+"
|
||||
result = re.sub(pattern, r"\1", text)
|
||||
return result
|
||||
|
||||
|
||||
@@ -152,7 +165,7 @@ def preprocess_jap(text, with_prosody=False):
|
||||
text += p.split(" ")
|
||||
|
||||
if i < len(marks):
|
||||
if marks[i] == " ":# 防止意外的UNK
|
||||
if marks[i] == " ": # 防止意外的UNK
|
||||
continue
|
||||
text += [marks[i].replace(" ", "")]
|
||||
return text
|
||||
@@ -165,6 +178,7 @@ def text_normalize(text):
|
||||
text = replace_consecutive_punctuation(text)
|
||||
return text
|
||||
|
||||
|
||||
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
|
||||
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
|
||||
"""Extract phoneme + prosoody symbol sequence from input full-context labels.
|
||||
@@ -241,6 +255,7 @@ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
|
||||
|
||||
return phones
|
||||
|
||||
|
||||
# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
|
||||
def _numeric_feature_by_regex(regex, s):
|
||||
match = re.search(regex, s)
|
||||
@@ -248,6 +263,7 @@ def _numeric_feature_by_regex(regex, s):
|
||||
return -50
|
||||
return int(match.group(1))
|
||||
|
||||
|
||||
def g2p(norm_text, with_prosody=True):
|
||||
phones = preprocess_jap(norm_text, with_prosody)
|
||||
phones = [post_replace_ph(i) for i in phones]
|
||||
|
||||
@@ -9,39 +9,43 @@ import importlib
|
||||
import os
|
||||
|
||||
# 防止win下无法读取模型
|
||||
if os.name == 'nt':
|
||||
if os.name == "nt":
|
||||
|
||||
class win_G2p(G2p):
|
||||
def check_mecab(self):
|
||||
super().check_mecab()
|
||||
spam_spec = importlib.util.find_spec("eunjeon")
|
||||
non_found = spam_spec is None
|
||||
if non_found:
|
||||
print(f'you have to install eunjeon. install it...')
|
||||
print("you have to install eunjeon. install it...")
|
||||
else:
|
||||
installpath = spam_spec.submodule_search_locations[0]
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
|
||||
|
||||
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
|
||||
import sys
|
||||
from eunjeon import Mecab as _Mecab
|
||||
|
||||
class Mecab(_Mecab):
|
||||
def get_dicpath(installpath):
|
||||
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
|
||||
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
|
||||
import shutil
|
||||
python_dir = os.getcwd()
|
||||
if (installpath[:len(python_dir)].upper() == python_dir.upper()):
|
||||
dicpath = os.path.join(os.path.relpath(installpath,python_dir),'data','mecabrc')
|
||||
else:
|
||||
if not os.path.exists('TEMP'):
|
||||
os.mkdir('TEMP')
|
||||
if not os.path.exists(os.path.join('TEMP', 'ko')):
|
||||
os.mkdir(os.path.join('TEMP', 'ko'))
|
||||
if os.path.exists(os.path.join('TEMP', 'ko', 'ko_dict')):
|
||||
shutil.rmtree(os.path.join('TEMP', 'ko', 'ko_dict'))
|
||||
|
||||
shutil.copytree(os.path.join(installpath, 'data'), os.path.join('TEMP', 'ko', 'ko_dict'))
|
||||
dicpath = os.path.join('TEMP', 'ko', 'ko_dict', 'mecabrc')
|
||||
python_dir = os.getcwd()
|
||||
if installpath[: len(python_dir)].upper() == python_dir.upper():
|
||||
dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc")
|
||||
else:
|
||||
if not os.path.exists("TEMP"):
|
||||
os.mkdir("TEMP")
|
||||
if not os.path.exists(os.path.join("TEMP", "ko")):
|
||||
os.mkdir(os.path.join("TEMP", "ko"))
|
||||
if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")):
|
||||
shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict"))
|
||||
|
||||
shutil.copytree(
|
||||
os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict")
|
||||
)
|
||||
dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc")
|
||||
else:
|
||||
dicpath=os.path.abspath(os.path.join(installpath, 'data/mecabrc'))
|
||||
dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc"))
|
||||
return dicpath
|
||||
|
||||
def __init__(self, dicpath=get_dicpath(installpath)):
|
||||
@@ -52,97 +56,108 @@ if os.name == 'nt':
|
||||
G2p = win_G2p
|
||||
|
||||
|
||||
from text.symbols2 import symbols
|
||||
from text.symbols2 import symbols
|
||||
|
||||
# This is a list of Korean classifiers preceded by pure Korean numerals.
|
||||
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
||||
_korean_classifiers = (
|
||||
"군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
|
||||
)
|
||||
|
||||
# List of (hangul, hangul divided) pairs:
|
||||
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
|
||||
# ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
|
||||
# ('ㄵ', 'ㄴㅈ'),
|
||||
# ('ㄶ', 'ㄴㅎ'),
|
||||
# ('ㄺ', 'ㄹㄱ'),
|
||||
# ('ㄻ', 'ㄹㅁ'),
|
||||
# ('ㄼ', 'ㄹㅂ'),
|
||||
# ('ㄽ', 'ㄹㅅ'),
|
||||
# ('ㄾ', 'ㄹㅌ'),
|
||||
# ('ㄿ', 'ㄹㅍ'),
|
||||
# ('ㅀ', 'ㄹㅎ'),
|
||||
# ('ㅄ', 'ㅂㅅ'),
|
||||
('ㅘ', 'ㅗㅏ'),
|
||||
('ㅙ', 'ㅗㅐ'),
|
||||
('ㅚ', 'ㅗㅣ'),
|
||||
('ㅝ', 'ㅜㅓ'),
|
||||
('ㅞ', 'ㅜㅔ'),
|
||||
('ㅟ', 'ㅜㅣ'),
|
||||
('ㅢ', 'ㅡㅣ'),
|
||||
('ㅑ', 'ㅣㅏ'),
|
||||
('ㅒ', 'ㅣㅐ'),
|
||||
('ㅕ', 'ㅣㅓ'),
|
||||
('ㅖ', 'ㅣㅔ'),
|
||||
('ㅛ', 'ㅣㅗ'),
|
||||
('ㅠ', 'ㅣㅜ')
|
||||
]]
|
||||
_hangul_divided = [
|
||||
(re.compile("%s" % x[0]), x[1])
|
||||
for x in [
|
||||
# ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
|
||||
# ('ㄵ', 'ㄴㅈ'),
|
||||
# ('ㄶ', 'ㄴㅎ'),
|
||||
# ('ㄺ', 'ㄹㄱ'),
|
||||
# ('ㄻ', 'ㄹㅁ'),
|
||||
# ('ㄼ', 'ㄹㅂ'),
|
||||
# ('ㄽ', 'ㄹㅅ'),
|
||||
# ('ㄾ', 'ㄹㅌ'),
|
||||
# ('ㄿ', 'ㄹㅍ'),
|
||||
# ('ㅀ', 'ㄹㅎ'),
|
||||
# ('ㅄ', 'ㅂㅅ'),
|
||||
("ㅘ", "ㅗㅏ"),
|
||||
("ㅙ", "ㅗㅐ"),
|
||||
("ㅚ", "ㅗㅣ"),
|
||||
("ㅝ", "ㅜㅓ"),
|
||||
("ㅞ", "ㅜㅔ"),
|
||||
("ㅟ", "ㅜㅣ"),
|
||||
("ㅢ", "ㅡㅣ"),
|
||||
("ㅑ", "ㅣㅏ"),
|
||||
("ㅒ", "ㅣㅐ"),
|
||||
("ㅕ", "ㅣㅓ"),
|
||||
("ㅖ", "ㅣㅔ"),
|
||||
("ㅛ", "ㅣㅗ"),
|
||||
("ㅠ", "ㅣㅜ"),
|
||||
]
|
||||
]
|
||||
|
||||
# List of (Latin alphabet, hangul) pairs:
|
||||
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
('a', '에이'),
|
||||
('b', '비'),
|
||||
('c', '시'),
|
||||
('d', '디'),
|
||||
('e', '이'),
|
||||
('f', '에프'),
|
||||
('g', '지'),
|
||||
('h', '에이치'),
|
||||
('i', '아이'),
|
||||
('j', '제이'),
|
||||
('k', '케이'),
|
||||
('l', '엘'),
|
||||
('m', '엠'),
|
||||
('n', '엔'),
|
||||
('o', '오'),
|
||||
('p', '피'),
|
||||
('q', '큐'),
|
||||
('r', '아르'),
|
||||
('s', '에스'),
|
||||
('t', '티'),
|
||||
('u', '유'),
|
||||
('v', '브이'),
|
||||
('w', '더블유'),
|
||||
('x', '엑스'),
|
||||
('y', '와이'),
|
||||
('z', '제트')
|
||||
]]
|
||||
_latin_to_hangul = [
|
||||
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
("a", "에이"),
|
||||
("b", "비"),
|
||||
("c", "시"),
|
||||
("d", "디"),
|
||||
("e", "이"),
|
||||
("f", "에프"),
|
||||
("g", "지"),
|
||||
("h", "에이치"),
|
||||
("i", "아이"),
|
||||
("j", "제이"),
|
||||
("k", "케이"),
|
||||
("l", "엘"),
|
||||
("m", "엠"),
|
||||
("n", "엔"),
|
||||
("o", "오"),
|
||||
("p", "피"),
|
||||
("q", "큐"),
|
||||
("r", "아르"),
|
||||
("s", "에스"),
|
||||
("t", "티"),
|
||||
("u", "유"),
|
||||
("v", "브이"),
|
||||
("w", "더블유"),
|
||||
("x", "엑스"),
|
||||
("y", "와이"),
|
||||
("z", "제트"),
|
||||
]
|
||||
]
|
||||
|
||||
# List of (ipa, lazy ipa) pairs:
|
||||
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
||||
('t͡ɕ','ʧ'),
|
||||
('d͡ʑ','ʥ'),
|
||||
('ɲ','n^'),
|
||||
('ɕ','ʃ'),
|
||||
('ʷ','w'),
|
||||
('ɭ','l`'),
|
||||
('ʎ','ɾ'),
|
||||
('ɣ','ŋ'),
|
||||
('ɰ','ɯ'),
|
||||
('ʝ','j'),
|
||||
('ʌ','ə'),
|
||||
('ɡ','g'),
|
||||
('\u031a','#'),
|
||||
('\u0348','='),
|
||||
('\u031e',''),
|
||||
('\u0320',''),
|
||||
('\u0339','')
|
||||
]]
|
||||
_ipa_to_lazy_ipa = [
|
||||
(re.compile("%s" % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
("t͡ɕ", "ʧ"),
|
||||
("d͡ʑ", "ʥ"),
|
||||
("ɲ", "n^"),
|
||||
("ɕ", "ʃ"),
|
||||
("ʷ", "w"),
|
||||
("ɭ", "l`"),
|
||||
("ʎ", "ɾ"),
|
||||
("ɣ", "ŋ"),
|
||||
("ɰ", "ɯ"),
|
||||
("ʝ", "j"),
|
||||
("ʌ", "ə"),
|
||||
("ɡ", "g"),
|
||||
("\u031a", "#"),
|
||||
("\u0348", "="),
|
||||
("\u031e", ""),
|
||||
("\u0320", ""),
|
||||
("\u0339", ""),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def fix_g2pk2_error(text):
|
||||
new_text = ""
|
||||
i = 0
|
||||
while i < len(text) - 4:
|
||||
if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == 'ㄹ':
|
||||
new_text += text[i:i+3] + ' ' + 'ㄴ'
|
||||
if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "ㄹ":
|
||||
new_text += text[i : i + 3] + " " + "ㄴ"
|
||||
i += 5
|
||||
else:
|
||||
new_text += text[i]
|
||||
@@ -166,20 +181,20 @@ def divide_hangul(text):
|
||||
|
||||
|
||||
def hangul_number(num, sino=True):
|
||||
'''Reference https://github.com/Kyubyong/g2pK'''
|
||||
num = re.sub(',', '', num)
|
||||
"""Reference https://github.com/Kyubyong/g2pK"""
|
||||
num = re.sub(",", "", num)
|
||||
|
||||
if num == '0':
|
||||
return '영'
|
||||
if not sino and num == '20':
|
||||
return '스무'
|
||||
if num == "0":
|
||||
return "영"
|
||||
if not sino and num == "20":
|
||||
return "스무"
|
||||
|
||||
digits = '123456789'
|
||||
names = '일이삼사오육칠팔구'
|
||||
digits = "123456789"
|
||||
names = "일이삼사오육칠팔구"
|
||||
digit2name = {d: n for d, n in zip(digits, names)}
|
||||
|
||||
modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
|
||||
decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
|
||||
modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
|
||||
decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
|
||||
digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
|
||||
digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
|
||||
|
||||
@@ -188,75 +203,75 @@ def hangul_number(num, sino=True):
|
||||
i = len(num) - i - 1
|
||||
if sino:
|
||||
if i == 0:
|
||||
name = digit2name.get(digit, '')
|
||||
name = digit2name.get(digit, "")
|
||||
elif i == 1:
|
||||
name = digit2name.get(digit, '') + '십'
|
||||
name = name.replace('일십', '십')
|
||||
name = digit2name.get(digit, "") + "십"
|
||||
name = name.replace("일십", "십")
|
||||
else:
|
||||
if i == 0:
|
||||
name = digit2mod.get(digit, '')
|
||||
name = digit2mod.get(digit, "")
|
||||
elif i == 1:
|
||||
name = digit2dec.get(digit, '')
|
||||
if digit == '0':
|
||||
name = digit2dec.get(digit, "")
|
||||
if digit == "0":
|
||||
if i % 4 == 0:
|
||||
last_three = spelledout[-min(3, len(spelledout)):]
|
||||
if ''.join(last_three) == '':
|
||||
spelledout.append('')
|
||||
last_three = spelledout[-min(3, len(spelledout)) :]
|
||||
if "".join(last_three) == "":
|
||||
spelledout.append("")
|
||||
continue
|
||||
else:
|
||||
spelledout.append('')
|
||||
spelledout.append("")
|
||||
continue
|
||||
if i == 2:
|
||||
name = digit2name.get(digit, '') + '백'
|
||||
name = name.replace('일백', '백')
|
||||
name = digit2name.get(digit, "") + "백"
|
||||
name = name.replace("일백", "백")
|
||||
elif i == 3:
|
||||
name = digit2name.get(digit, '') + '천'
|
||||
name = name.replace('일천', '천')
|
||||
name = digit2name.get(digit, "") + "천"
|
||||
name = name.replace("일천", "천")
|
||||
elif i == 4:
|
||||
name = digit2name.get(digit, '') + '만'
|
||||
name = name.replace('일만', '만')
|
||||
name = digit2name.get(digit, "") + "만"
|
||||
name = name.replace("일만", "만")
|
||||
elif i == 5:
|
||||
name = digit2name.get(digit, '') + '십'
|
||||
name = name.replace('일십', '십')
|
||||
name = digit2name.get(digit, "") + "십"
|
||||
name = name.replace("일십", "십")
|
||||
elif i == 6:
|
||||
name = digit2name.get(digit, '') + '백'
|
||||
name = name.replace('일백', '백')
|
||||
name = digit2name.get(digit, "") + "백"
|
||||
name = name.replace("일백", "백")
|
||||
elif i == 7:
|
||||
name = digit2name.get(digit, '') + '천'
|
||||
name = name.replace('일천', '천')
|
||||
name = digit2name.get(digit, "") + "천"
|
||||
name = name.replace("일천", "천")
|
||||
elif i == 8:
|
||||
name = digit2name.get(digit, '') + '억'
|
||||
name = digit2name.get(digit, "") + "억"
|
||||
elif i == 9:
|
||||
name = digit2name.get(digit, '') + '십'
|
||||
name = digit2name.get(digit, "") + "십"
|
||||
elif i == 10:
|
||||
name = digit2name.get(digit, '') + '백'
|
||||
name = digit2name.get(digit, "") + "백"
|
||||
elif i == 11:
|
||||
name = digit2name.get(digit, '') + '천'
|
||||
name = digit2name.get(digit, "") + "천"
|
||||
elif i == 12:
|
||||
name = digit2name.get(digit, '') + '조'
|
||||
name = digit2name.get(digit, "") + "조"
|
||||
elif i == 13:
|
||||
name = digit2name.get(digit, '') + '십'
|
||||
name = digit2name.get(digit, "") + "십"
|
||||
elif i == 14:
|
||||
name = digit2name.get(digit, '') + '백'
|
||||
name = digit2name.get(digit, "") + "백"
|
||||
elif i == 15:
|
||||
name = digit2name.get(digit, '') + '천'
|
||||
name = digit2name.get(digit, "") + "천"
|
||||
spelledout.append(name)
|
||||
return ''.join(elem for elem in spelledout)
|
||||
return "".join(elem for elem in spelledout)
|
||||
|
||||
|
||||
def number_to_hangul(text):
|
||||
'''Reference https://github.com/Kyubyong/g2pK'''
|
||||
tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
|
||||
"""Reference https://github.com/Kyubyong/g2pK"""
|
||||
tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text))
|
||||
for token in tokens:
|
||||
num, classifier = token
|
||||
if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
|
||||
spelledout = hangul_number(num, sino=False)
|
||||
else:
|
||||
spelledout = hangul_number(num, sino=True)
|
||||
text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
|
||||
text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}")
|
||||
# digit by digit for remaining digits
digits = '0123456789'
names = '영일이삼사오육칠팔구'
digits = "0123456789"
names = "영일이삼사오육칠팔구"
for d, n in zip(digits, names):
text = text.replace(d, n)
return text
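# Orientation sketch (not part of the diff), assuming this module's helpers are in scope:
# a numeral followed by a native-Korean classifier takes the sino=False reading, while any
# leftover bare digits fall through to the digit-by-digit table above.
print(number_to_hangul("사과 3개"))  # expected: 사과 세개
print(number_to_hangul("7"))  # expected: 칠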
@@ -265,19 +280,23 @@ def number_to_hangul(text):
|
||||
def korean_to_lazy_ipa(text):
|
||||
text = latin_to_hangul(text)
|
||||
text = number_to_hangul(text)
|
||||
text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
|
||||
text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text)
|
||||
for regex, replacement in _ipa_to_lazy_ipa:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
_g2p=G2p()
|
||||
|
||||
_g2p = G2p()
|
||||
|
||||
|
||||
def korean_to_ipa(text):
|
||||
text = latin_to_hangul(text)
|
||||
text = number_to_hangul(text)
|
||||
text = _g2p(text)
|
||||
text = fix_g2pk2_error(text)
|
||||
text = korean_to_lazy_ipa(text)
|
||||
return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
|
||||
return text.replace("ʧ", "tʃ").replace("ʥ", "dʑ")
|
||||
|
||||
|
||||
def post_replace_ph(ph):
|
||||
rep_map = {
|
||||
@@ -301,12 +320,13 @@ def post_replace_ph(ph):
|
||||
ph = "停"
|
||||
return ph
|
||||
|
||||
|
||||
def g2p(text):
|
||||
text = latin_to_hangul(text)
|
||||
text = _g2p(text)
|
||||
text = divide_hangul(text)
|
||||
text = fix_g2pk2_error(text)
|
||||
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
|
||||
text = re.sub(r"([\u3131-\u3163])$", r"\1.", text)
|
||||
# text = "".join([post_replace_ph(i) for i in text])
|
||||
text = [post_replace_ph(i) for i in text]
|
||||
return text
|
||||
@@ -314,4 +334,4 @@ def g2p(text):
|
||||
|
||||
if __name__ == "__main__":
|
||||
text = "안녕하세요"
|
||||
print(g2p(text))
|
||||
print(g2p(text))
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import os
|
||||
|
||||
# punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
|
||||
punctuation = ["!", "?", "…", ",", "."] # @是SP停顿
|
||||
punctuation.append("-")
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import os
|
||||
|
||||
# punctuation = ['!', '?', '…', ",", ".","@"]#@是SP停顿
|
||||
punctuation = ["!", "?", "…", ",", "."] # @是SP停顿
|
||||
punctuation.append("-")
|
||||
@@ -396,24 +394,404 @@ arpa = {
|
||||
"SH",
|
||||
}
|
||||
|
||||
ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停'
|
||||
ko_symbols = "ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ空停"
|
||||
# ko_symbols='ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
|
||||
|
||||
yue_symbols={'Yeot3', 'Yip1', 'Yyu3', 'Yeng4', 'Yut5', 'Yaan5', 'Ym5', 'Yaan6', 'Yang1', 'Yun4', 'Yon2', 'Yui5', 'Yun2', 'Yat3', 'Ye', 'Yeot1', 'Yoeng5', 'Yoek2', 'Yam2', 'Yeon6', 'Yu6', 'Yiu3', 'Yaang6', 'Yp5', 'Yai4', 'Yoek4', 'Yit6', 'Yam5', 'Yoeng6', 'Yg1', 'Yk3', 'Yoe4', 'Yam3', 'Yc', 'Yyu4', 'Yyut1', 'Yiu4', 'Ying3', 'Yip3', 'Yaap3', 'Yau3', 'Yan4', 'Yau1', 'Yap4', 'Yk6', 'Yok3', 'Yai1', 'Yeot6', 'Yan2', 'Yoek6', 'Yt1', 'Yoi1', 'Yit5', 'Yn4', 'Yaau3', 'Yau4', 'Yuk6', 'Ys', 'Yuk', 'Yin6', 'Yung6', 'Ya', 'You', 'Yaai5', 'Yau5', 'Yoi3', 'Yaak3', 'Yaat3', 'Ying2', 'Yok5', 'Yeng2', 'Yyut3', 'Yam1', 'Yip5', 'You1', 'Yam6', 'Yaa5', 'Yi6', 'Yek4', 'Yyu2', 'Yuk5', 'Yaam1', 'Yang2', 'Yai', 'Yiu6', 'Yin4', 'Yok4', 'Yot3', 'Yui2', 'Yeoi5', 'Yyun6', 'Yyu5', 'Yoi5', 'Yeot2', 'Yim4', 'Yeoi2', 'Yaan1', 'Yang6', 'Yong1', 'Yaang4', 'Yung5', 'Yeon1', 'Yin2', 'Ya3', 'Yaang3', 'Yg', 'Yk2', 'Yaau5', 'Yut1', 'Yt5', 'Yip4', 'Yung4', 'Yj', 'Yong3', 'Ya1', 'Yg6', 'Yaau6', 'Yit3', 'Yun3', 'Ying1', 'Yn2', 'Yg4', 'Yl', 'Yp3', 'Yn3', 'Yak1', 'Yang5', 'Yoe6', 'You2', 'Yap2', 'Yak2', 'Yt3', 'Yot5', 'Yim2', 'Yi1', 'Yn6', 'Yaat5', 'Yaam3', 'Yoek5', 'Ye3', 'Yeon4', 'Yaa2', 'Yu3', 'Yim6', 'Ym', 'Yoe3', 'Yaai2', 'Ym2', 'Ya6', 'Yeng6', 'Yik4', 'Yot4', 'Yaai4', 'Yyun3', 'Yu1', 'Yoeng1', 'Yaap2', 'Yuk3', 'Yoek3', 'Yeng5', 'Yeoi1', 'Yiu2', 'Yok1', 'Yo1', 'Yoek1', 'Yoeng2', 'Yeon5', 'Yiu1', 'Yoeng4', 'Yuk2', 'Yat4', 'Yg5', 'Yut4', 'Yan6', 'Yin3', 'Yaa6', 'Yap1', 'Yg2', 'Yoe5', 'Yt4', 'Ya5', 'Yo4', 'Yyu1', 'Yak3', 'Yeon2', 'Yong4', 'Ym1', 'Ye2', 'Yaang5', 'Yoi2', 'Yeng3', 'Yn', 'Yyut4', 'Yau', 'Yaak2', 'Yaan4', 'Yek2', 'Yin1', 'Yi5', 'Yoe2', 'Yei5', 'Yaat6', 'Yak5', 'Yp6', 'Yok6', 'Yei2', 'Yaap1', 'Yyut5', 'Yi4', 'Yim1', 'Yk5', 'Ye4', 'Yok2', 'Yaam6', 'Yat2', 'Yon6', 'Yei3', 'Yyu6', 'Yeot5', 'Yk4', 'Yai6', 'Yd', 'Yg3', 'Yei6', 'Yau2', 'Yok', 'Yau6', 'Yung3', 'Yim5', 'Yut6', 'Yit1', 'Yon3', 'Yat1', 'Yaam2', 'Yyut2', 'Yui6', 'Yt2', 'Yek6', 'Yt', 'Ye6', 'Yang3', 'Ying6', 'Yaau1', 'Yeon3', 'Yng', 'Yh', 'Yang4', 'Ying5', 'Yaap6', 'Yoeng3', 'Yyun4', 'You3', 'Yan5', 'Yat5', 'Yot1', 'Yun1', 'Yi3', 'Yaa1', 'Yaap4', 'You6', 'Yaang2', 'Yaap5', 'Yaa3', 'Yaak6', 'Yeng1', 'Yaak1', 'Yo5', 'Yoi4', 'Yam4', 'Yik1', 'Ye1', 'Yai5', 'Yung1', 'Yp2', 'Yui4', 'Yaak4', 'Yung2', 'Yak4', 'Yaat4', 'Yeoi4', 'Yut2', 'Yin5', 'Yaau4', 'Yap6', 'Yb', 'Yaam4', 'Yw', 'Yut3', 'Yong2', 'Yt6', 'Yaai6', 'Yap5', 'Yik5', 'Yun6', 'Yaam5', 'Yun5', 'Yik3', 'Ya2', 'Yyut6', 'Yon4', 'Yk1', 'Yit4', 'Yak6', 'Yaan2', 'Yuk1', 'Yai2', 'Yik2', 'Yaat2', 'Yo3', 'Ykw', 'Yn5', 'Yaa', 'Ye5', 'Yu4', 'Yei1', 'Yai3', 'Yyun5', 'Yip2', 'Yaau2', 'Yiu5', 'Ym4', 'Yeoi6', 'Yk', 'Ym6', 'Yoe1', 'Yeoi3', 'Yon', 'Yuk4', 'Yaai3', 'Yaa4', 'Yot6', 'Yaang1', 'Yei4', 'Yek1', 'Yo', 'Yp', 'Yo6', 'Yp4', 'Yan3', 'Yoi', 'Yap3', 'Yek3', 'Yim3', 'Yz', 'Yot2', 'Yoi6', 'Yit2', 'Yu5', 'Yaan3', 'Yan1', 'Yon5', 'Yp1', 'Yong5', 'Ygw', 'Yak', 'Yat6', 'Ying4', 'Yu2', 'Yf', 'Ya4', 'Yon1', 'You4', 'Yik6', 'Yui1', 'Yaat1', 'Yeot4', 'Yi2', 'Yaai1', 'Yek5', 'Ym3', 'Yong6', 'You5', 'Yyun1', 'Yn1', 'Yo2', 'Yip6', 'Yui3', 'Yaak5', 'Yyun2'}
|
||||
yue_symbols = {
|
||||
"Yeot3",
|
||||
"Yip1",
|
||||
"Yyu3",
|
||||
"Yeng4",
|
||||
"Yut5",
|
||||
"Yaan5",
|
||||
"Ym5",
|
||||
"Yaan6",
|
||||
"Yang1",
|
||||
"Yun4",
|
||||
"Yon2",
|
||||
"Yui5",
|
||||
"Yun2",
|
||||
"Yat3",
|
||||
"Ye",
|
||||
"Yeot1",
|
||||
"Yoeng5",
|
||||
"Yoek2",
|
||||
"Yam2",
|
||||
"Yeon6",
|
||||
"Yu6",
|
||||
"Yiu3",
|
||||
"Yaang6",
|
||||
"Yp5",
|
||||
"Yai4",
|
||||
"Yoek4",
|
||||
"Yit6",
|
||||
"Yam5",
|
||||
"Yoeng6",
|
||||
"Yg1",
|
||||
"Yk3",
|
||||
"Yoe4",
|
||||
"Yam3",
|
||||
"Yc",
|
||||
"Yyu4",
|
||||
"Yyut1",
|
||||
"Yiu4",
|
||||
"Ying3",
|
||||
"Yip3",
|
||||
"Yaap3",
|
||||
"Yau3",
|
||||
"Yan4",
|
||||
"Yau1",
|
||||
"Yap4",
|
||||
"Yk6",
|
||||
"Yok3",
|
||||
"Yai1",
|
||||
"Yeot6",
|
||||
"Yan2",
|
||||
"Yoek6",
|
||||
"Yt1",
|
||||
"Yoi1",
|
||||
"Yit5",
|
||||
"Yn4",
|
||||
"Yaau3",
|
||||
"Yau4",
|
||||
"Yuk6",
|
||||
"Ys",
|
||||
"Yuk",
|
||||
"Yin6",
|
||||
"Yung6",
|
||||
"Ya",
|
||||
"You",
|
||||
"Yaai5",
|
||||
"Yau5",
|
||||
"Yoi3",
|
||||
"Yaak3",
|
||||
"Yaat3",
|
||||
"Ying2",
|
||||
"Yok5",
|
||||
"Yeng2",
|
||||
"Yyut3",
|
||||
"Yam1",
|
||||
"Yip5",
|
||||
"You1",
|
||||
"Yam6",
|
||||
"Yaa5",
|
||||
"Yi6",
|
||||
"Yek4",
|
||||
"Yyu2",
|
||||
"Yuk5",
|
||||
"Yaam1",
|
||||
"Yang2",
|
||||
"Yai",
|
||||
"Yiu6",
|
||||
"Yin4",
|
||||
"Yok4",
|
||||
"Yot3",
|
||||
"Yui2",
|
||||
"Yeoi5",
|
||||
"Yyun6",
|
||||
"Yyu5",
|
||||
"Yoi5",
|
||||
"Yeot2",
|
||||
"Yim4",
|
||||
"Yeoi2",
|
||||
"Yaan1",
|
||||
"Yang6",
|
||||
"Yong1",
|
||||
"Yaang4",
|
||||
"Yung5",
|
||||
"Yeon1",
|
||||
"Yin2",
|
||||
"Ya3",
|
||||
"Yaang3",
|
||||
"Yg",
|
||||
"Yk2",
|
||||
"Yaau5",
|
||||
"Yut1",
|
||||
"Yt5",
|
||||
"Yip4",
|
||||
"Yung4",
|
||||
"Yj",
|
||||
"Yong3",
|
||||
"Ya1",
|
||||
"Yg6",
|
||||
"Yaau6",
|
||||
"Yit3",
|
||||
"Yun3",
|
||||
"Ying1",
|
||||
"Yn2",
|
||||
"Yg4",
|
||||
"Yl",
|
||||
"Yp3",
|
||||
"Yn3",
|
||||
"Yak1",
|
||||
"Yang5",
|
||||
"Yoe6",
|
||||
"You2",
|
||||
"Yap2",
|
||||
"Yak2",
|
||||
"Yt3",
|
||||
"Yot5",
|
||||
"Yim2",
|
||||
"Yi1",
|
||||
"Yn6",
|
||||
"Yaat5",
|
||||
"Yaam3",
|
||||
"Yoek5",
|
||||
"Ye3",
|
||||
"Yeon4",
|
||||
"Yaa2",
|
||||
"Yu3",
|
||||
"Yim6",
|
||||
"Ym",
|
||||
"Yoe3",
|
||||
"Yaai2",
|
||||
"Ym2",
|
||||
"Ya6",
|
||||
"Yeng6",
|
||||
"Yik4",
|
||||
"Yot4",
|
||||
"Yaai4",
|
||||
"Yyun3",
|
||||
"Yu1",
|
||||
"Yoeng1",
|
||||
"Yaap2",
|
||||
"Yuk3",
|
||||
"Yoek3",
|
||||
"Yeng5",
|
||||
"Yeoi1",
|
||||
"Yiu2",
|
||||
"Yok1",
|
||||
"Yo1",
|
||||
"Yoek1",
|
||||
"Yoeng2",
|
||||
"Yeon5",
|
||||
"Yiu1",
|
||||
"Yoeng4",
|
||||
"Yuk2",
|
||||
"Yat4",
|
||||
"Yg5",
|
||||
"Yut4",
|
||||
"Yan6",
|
||||
"Yin3",
|
||||
"Yaa6",
|
||||
"Yap1",
|
||||
"Yg2",
|
||||
"Yoe5",
|
||||
"Yt4",
|
||||
"Ya5",
|
||||
"Yo4",
|
||||
"Yyu1",
|
||||
"Yak3",
|
||||
"Yeon2",
|
||||
"Yong4",
|
||||
"Ym1",
|
||||
"Ye2",
|
||||
"Yaang5",
|
||||
"Yoi2",
|
||||
"Yeng3",
|
||||
"Yn",
|
||||
"Yyut4",
|
||||
"Yau",
|
||||
"Yaak2",
|
||||
"Yaan4",
|
||||
"Yek2",
|
||||
"Yin1",
|
||||
"Yi5",
|
||||
"Yoe2",
|
||||
"Yei5",
|
||||
"Yaat6",
|
||||
"Yak5",
|
||||
"Yp6",
|
||||
"Yok6",
|
||||
"Yei2",
|
||||
"Yaap1",
|
||||
"Yyut5",
|
||||
"Yi4",
|
||||
"Yim1",
|
||||
"Yk5",
|
||||
"Ye4",
|
||||
"Yok2",
|
||||
"Yaam6",
|
||||
"Yat2",
|
||||
"Yon6",
|
||||
"Yei3",
|
||||
"Yyu6",
|
||||
"Yeot5",
|
||||
"Yk4",
|
||||
"Yai6",
|
||||
"Yd",
|
||||
"Yg3",
|
||||
"Yei6",
|
||||
"Yau2",
|
||||
"Yok",
|
||||
"Yau6",
|
||||
"Yung3",
|
||||
"Yim5",
|
||||
"Yut6",
|
||||
"Yit1",
|
||||
"Yon3",
|
||||
"Yat1",
|
||||
"Yaam2",
|
||||
"Yyut2",
|
||||
"Yui6",
|
||||
"Yt2",
|
||||
"Yek6",
|
||||
"Yt",
|
||||
"Ye6",
|
||||
"Yang3",
|
||||
"Ying6",
|
||||
"Yaau1",
|
||||
"Yeon3",
|
||||
"Yng",
|
||||
"Yh",
|
||||
"Yang4",
|
||||
"Ying5",
|
||||
"Yaap6",
|
||||
"Yoeng3",
|
||||
"Yyun4",
|
||||
"You3",
|
||||
"Yan5",
|
||||
"Yat5",
|
||||
"Yot1",
|
||||
"Yun1",
|
||||
"Yi3",
|
||||
"Yaa1",
|
||||
"Yaap4",
|
||||
"You6",
|
||||
"Yaang2",
|
||||
"Yaap5",
|
||||
"Yaa3",
|
||||
"Yaak6",
|
||||
"Yeng1",
|
||||
"Yaak1",
|
||||
"Yo5",
|
||||
"Yoi4",
|
||||
"Yam4",
|
||||
"Yik1",
|
||||
"Ye1",
|
||||
"Yai5",
|
||||
"Yung1",
|
||||
"Yp2",
|
||||
"Yui4",
|
||||
"Yaak4",
|
||||
"Yung2",
|
||||
"Yak4",
|
||||
"Yaat4",
|
||||
"Yeoi4",
|
||||
"Yut2",
|
||||
"Yin5",
|
||||
"Yaau4",
|
||||
"Yap6",
|
||||
"Yb",
|
||||
"Yaam4",
|
||||
"Yw",
|
||||
"Yut3",
|
||||
"Yong2",
|
||||
"Yt6",
|
||||
"Yaai6",
|
||||
"Yap5",
|
||||
"Yik5",
|
||||
"Yun6",
|
||||
"Yaam5",
|
||||
"Yun5",
|
||||
"Yik3",
|
||||
"Ya2",
|
||||
"Yyut6",
|
||||
"Yon4",
|
||||
"Yk1",
|
||||
"Yit4",
|
||||
"Yak6",
|
||||
"Yaan2",
|
||||
"Yuk1",
|
||||
"Yai2",
|
||||
"Yik2",
|
||||
"Yaat2",
|
||||
"Yo3",
|
||||
"Ykw",
|
||||
"Yn5",
|
||||
"Yaa",
|
||||
"Ye5",
|
||||
"Yu4",
|
||||
"Yei1",
|
||||
"Yai3",
|
||||
"Yyun5",
|
||||
"Yip2",
|
||||
"Yaau2",
|
||||
"Yiu5",
|
||||
"Ym4",
|
||||
"Yeoi6",
|
||||
"Yk",
|
||||
"Ym6",
|
||||
"Yoe1",
|
||||
"Yeoi3",
|
||||
"Yon",
|
||||
"Yuk4",
|
||||
"Yaai3",
|
||||
"Yaa4",
|
||||
"Yot6",
|
||||
"Yaang1",
|
||||
"Yei4",
|
||||
"Yek1",
|
||||
"Yo",
|
||||
"Yp",
|
||||
"Yo6",
|
||||
"Yp4",
|
||||
"Yan3",
|
||||
"Yoi",
|
||||
"Yap3",
|
||||
"Yek3",
|
||||
"Yim3",
|
||||
"Yz",
|
||||
"Yot2",
|
||||
"Yoi6",
|
||||
"Yit2",
|
||||
"Yu5",
|
||||
"Yaan3",
|
||||
"Yan1",
|
||||
"Yon5",
|
||||
"Yp1",
|
||||
"Yong5",
|
||||
"Ygw",
|
||||
"Yak",
|
||||
"Yat6",
|
||||
"Ying4",
|
||||
"Yu2",
|
||||
"Yf",
|
||||
"Ya4",
|
||||
"Yon1",
|
||||
"You4",
|
||||
"Yik6",
|
||||
"Yui1",
|
||||
"Yaat1",
|
||||
"Yeot4",
|
||||
"Yi2",
|
||||
"Yaai1",
|
||||
"Yek5",
|
||||
"Ym3",
|
||||
"Yong6",
|
||||
"You5",
|
||||
"Yyun1",
|
||||
"Yn1",
|
||||
"Yo2",
|
||||
"Yip6",
|
||||
"Yui3",
|
||||
"Yaak5",
|
||||
"Yyun2",
|
||||
}
|
||||
|
||||
# symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)+list(ko_symbols)#+list(yue_symbols)###直接这么加yue顺序乱了
|
||||
symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
|
||||
symbols = sorted(set(symbols))
|
||||
# print(len(symbols))
|
||||
symbols+=["[","]"]##日文新增上升下降调型
|
||||
symbols+=sorted(list(ko_symbols))
|
||||
symbols+=sorted(list(yue_symbols))##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复
|
||||
symbols += ["[", "]"] ##日文新增上升下降调型
|
||||
symbols += sorted(list(ko_symbols))
|
||||
symbols += sorted(list(yue_symbols)) ##新加的yue统一摆在后头#已查过开头加Y后没有重复,韩文显然不会重复
|
||||
# print(len(symbols))
|
||||
if __name__ == "__main__":
|
||||
print(len(symbols))
|
||||
'''
|
||||
"""
|
||||
粤语:
|
||||
732-353=379
|
||||
韩文+粤语:
|
||||
732-322=410
|
||||
'''
|
||||
"""
|
||||
|
||||
@@ -510,12 +510,7 @@ class ToneSandhi:
|
||||
# e.g. 走了, 看着, 去过
|
||||
elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
|
||||
finals[-1] = finals[-1][:-1] + "5"
|
||||
elif (
|
||||
len(word) > 1
|
||||
and word[-1] in "们子"
|
||||
and pos in {"r", "n"}
|
||||
and word not in self.must_not_neural_tone_words
|
||||
):
|
||||
elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
|
||||
finals[-1] = finals[-1][:-1] + "5"
|
||||
# e.g. 桌上, 地下, 家里
|
||||
elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
|
||||
@@ -525,25 +520,18 @@ class ToneSandhi:
|
||||
finals[-1] = finals[-1][:-1] + "5"
|
||||
# 个做量词
|
||||
elif (
|
||||
ge_idx >= 1
|
||||
and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
|
||||
ge_idx >= 1 and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
|
||||
) or word == "个":
|
||||
finals[ge_idx] = finals[ge_idx][:-1] + "5"
|
||||
else:
|
||||
if (
|
||||
word in self.must_neural_tone_words
|
||||
or word[-2:] in self.must_neural_tone_words
|
||||
):
|
||||
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
|
||||
finals[-1] = finals[-1][:-1] + "5"
|
||||
|
||||
word_list = self._split_word(word)
|
||||
finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
|
||||
for i, word in enumerate(word_list):
|
||||
# conventional neural in Chinese
|
||||
if (
|
||||
word in self.must_neural_tone_words
|
||||
or word[-2:] in self.must_neural_tone_words
|
||||
):
|
||||
if word in self.must_neural_tone_words or word[-2:] in self.must_neural_tone_words:
|
||||
finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
|
||||
finals = sum(finals_list, [])
|
||||
return finals
|
||||
@@ -561,9 +549,7 @@ class ToneSandhi:
|
||||
|
||||
def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
|
||||
# "一" in number sequences, e.g. 一零零, 二一零
|
||||
if word.find("一") != -1 and all(
|
||||
[item.isnumeric() for item in word if item != "一"]
|
||||
):
|
||||
if word.find("一") != -1 and all([item.isnumeric() for item in word if item != "一"]):
|
||||
return finals
|
||||
# "一" between reduplication words shold be yi5, e.g. 看一看
|
||||
elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
|
||||
@@ -697,13 +683,10 @@ class ToneSandhi:
|
||||
return new_seg
|
||||
|
||||
# the first and the second words are all_tone_three
|
||||
def _merge_continuous_three_tones(
|
||||
self, seg: List[Tuple[str, str]]
|
||||
) -> List[Tuple[str, str]]:
|
||||
def _merge_continuous_three_tones(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
||||
new_seg = []
|
||||
sub_finals_list = [
|
||||
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
|
||||
for (word, pos) in seg
|
||||
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
|
||||
]
|
||||
assert len(sub_finals_list) == len(seg)
|
||||
merge_last = [False] * len(seg)
|
||||
@@ -715,10 +698,7 @@ class ToneSandhi:
|
||||
and not merge_last[i - 1]
|
||||
):
|
||||
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
|
||||
if (
|
||||
not self._is_reduplication(seg[i - 1][0])
|
||||
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
|
||||
):
|
||||
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
|
||||
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
||||
merge_last[i] = True
|
||||
else:
|
||||
@@ -732,13 +712,10 @@ class ToneSandhi:
|
||||
return len(word) == 2 and word[0] == word[1]
|
||||
|
||||
# the last char of first word and the first char of second word is tone_three
|
||||
def _merge_continuous_three_tones_2(
|
||||
self, seg: List[Tuple[str, str]]
|
||||
) -> List[Tuple[str, str]]:
|
||||
def _merge_continuous_three_tones_2(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
||||
new_seg = []
|
||||
sub_finals_list = [
|
||||
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
|
||||
for (word, pos) in seg
|
||||
lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) for (word, pos) in seg
|
||||
]
|
||||
assert len(sub_finals_list) == len(seg)
|
||||
merge_last = [False] * len(seg)
|
||||
@@ -750,10 +727,7 @@ class ToneSandhi:
|
||||
and not merge_last[i - 1]
|
||||
):
|
||||
# if the last word is reduplication, not merge, because reduplication need to be _neural_sandhi
|
||||
if (
|
||||
not self._is_reduplication(seg[i - 1][0])
|
||||
and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
|
||||
):
|
||||
if not self._is_reduplication(seg[i - 1][0]) and len(seg[i - 1][0]) + len(seg[i][0]) <= 3:
|
||||
new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
|
||||
merge_last[i] = True
|
||||
else:
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -21,25 +21,29 @@ from .num import verbalize_digit

def _time_num2str(num_string: str) -> str:
"""A special case for verbalizing number in time."""
result = num2str(num_string.lstrip('0'))
if num_string.startswith('0'):
result = DIGITS['0'] + result
result = num2str(num_string.lstrip("0"))
if num_string.startswith("0"):
result = DIGITS["0"] + result
return result


# 时刻表达式
RE_TIME = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
RE_TIME = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)

# 时间范围,如8:30-12:30
RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?'
r'(~|-)'
r'([0-1]?[0-9]|2[0-3])'
r':([0-5][0-9])'
r'(:([0-5][0-9]))?')
RE_TIME_RANGE = re.compile(
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
r"(~|-)"
r"([0-1]?[0-9]|2[0-3])"
r":([0-5][0-9])"
r"(:([0-5][0-9]))?"
)

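# Illustrative check (not part of the diff): the reformatted patterns keep their behaviour;
# per the comments above, RE_TIME matches a clock time and RE_TIME_RANGE a range such as 8:30-12:30.
assert RE_TIME.fullmatch("8:30") is not None
assert RE_TIME_RANGE.fullmatch("8:30-12:30") is not None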
def replace_time(match) -> str:
|
||||
@@ -62,31 +66,33 @@ def replace_time(match) -> str:
|
||||
second_2 = match.group(9)
|
||||
|
||||
result = f"{num2str(hour)}点"
|
||||
if minute.lstrip('0'):
|
||||
if minute.lstrip("0"):
|
||||
if int(minute) == 30:
|
||||
result += "半"
|
||||
else:
|
||||
result += f"{_time_num2str(minute)}分"
|
||||
if second and second.lstrip('0'):
|
||||
if second and second.lstrip("0"):
|
||||
result += f"{_time_num2str(second)}秒"
|
||||
|
||||
if is_range:
|
||||
result += "至"
|
||||
result += f"{num2str(hour_2)}点"
|
||||
if minute_2.lstrip('0'):
|
||||
if minute_2.lstrip("0"):
|
||||
if int(minute) == 30:
|
||||
result += "半"
|
||||
else:
|
||||
result += f"{_time_num2str(minute_2)}分"
|
||||
if second_2 and second_2.lstrip('0'):
|
||||
if second_2 and second_2.lstrip("0"):
|
||||
result += f"{_time_num2str(second_2)}秒"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
RE_DATE = re.compile(r'(\d{4}|\d{2})年'
|
||||
r'((0?[1-9]|1[0-2])月)?'
|
||||
r'(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?')
|
||||
RE_DATE = re.compile(
|
||||
r"(\d{4}|\d{2})年"
|
||||
r"((0?[1-9]|1[0-2])月)?"
|
||||
r"(((0?[1-9])|((1|2)[0-9])|30|31)([日号]))?"
|
||||
)
|
||||
|
||||
|
||||
def replace_date(match) -> str:
|
||||
@@ -110,8 +116,7 @@ def replace_date(match) -> str:
|
||||
|
||||
|
||||
# 用 / 或者 - 分隔的 YY/MM/DD 或者 YY-MM-DD 日期
|
||||
RE_DATE2 = re.compile(
|
||||
r'(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])')
|
||||
RE_DATE2 = re.compile(r"(\d{4})([- /.])(0[1-9]|1[012])\2(0[1-9]|[12][0-9]|3[01])")
|
||||
|
||||
|
||||
def replace_date2(match) -> str:
|
||||
|
||||
@@ -18,10 +18,7 @@ from pypinyin.constants import SUPPORT_UCS4
|
||||
|
||||
# 全角半角转换
|
||||
# 英文字符全角 -> 半角映射表 (num: 52)
|
||||
F2H_ASCII_LETTERS = {
|
||||
ord(char) + 65248: ord(char)
|
||||
for char in string.ascii_letters
|
||||
}
|
||||
F2H_ASCII_LETTERS = {ord(char) + 65248: ord(char) for char in string.ascii_letters}
|
||||
|
||||
# 英文字符半角 -> 全角映射表
|
||||
H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}
|
||||
@@ -37,26 +34,29 @@ F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
|
||||
H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
|
||||
|
||||
# 空格 (num: 1)
|
||||
F2H_SPACE = {'\u3000': ' '}
|
||||
H2F_SPACE = {' ': '\u3000'}
|
||||
F2H_SPACE = {"\u3000": " "}
|
||||
H2F_SPACE = {" ": "\u3000"}
|
||||
|
||||
# 非"有拼音的汉字"的字符串,可用于NSW提取
|
||||
if SUPPORT_UCS4:
|
||||
RE_NSW = re.compile(r'(?:[^'
|
||||
r'\u3007' # 〇
|
||||
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
|
||||
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
|
||||
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
|
||||
r'\U00020000-\U0002A6DF' # CJK扩展B:[20000-2A6DF]
|
||||
r'\U0002A703-\U0002B73F' # CJK扩展C:[2A700-2B73F]
|
||||
r'\U0002B740-\U0002B81D' # CJK扩展D:[2B740-2B81D]
|
||||
r'\U0002F80A-\U0002FA1F' # CJK兼容扩展:[2F800-2FA1F]
|
||||
r'])+')
|
||||
RE_NSW = re.compile(
|
||||
r"(?:[^"
|
||||
r"\u3007" # 〇
|
||||
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
|
||||
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
|
||||
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
|
||||
r"\U00020000-\U0002A6DF" # CJK扩展B:[20000-2A6DF]
|
||||
r"\U0002A703-\U0002B73F" # CJK扩展C:[2A700-2B73F]
|
||||
r"\U0002B740-\U0002B81D" # CJK扩展D:[2B740-2B81D]
|
||||
r"\U0002F80A-\U0002FA1F" # CJK兼容扩展:[2F800-2FA1F]
|
||||
r"])+"
|
||||
)
|
||||
else:
|
||||
RE_NSW = re.compile( # pragma: no cover
|
||||
r'(?:[^'
|
||||
r'\u3007' # 〇
|
||||
r'\u3400-\u4dbf' # CJK扩展A:[3400-4DBF]
|
||||
r'\u4e00-\u9fff' # CJK基本:[4E00-9FFF]
|
||||
r'\uf900-\ufaff' # CJK兼容:[F900-FAFF]
|
||||
r'])+')
|
||||
r"(?:[^"
|
||||
r"\u3007" # 〇
|
||||
r"\u3400-\u4dbf" # CJK扩展A:[3400-4DBF]
|
||||
r"\u4e00-\u9fff" # CJK基本:[4E00-9FFF]
|
||||
r"\uf900-\ufaff" # CJK兼容:[F900-FAFF]
|
||||
r"])+"
|
||||
)
|
||||
|
||||
@@ -15,23 +15,26 @@
|
||||
Rules to verbalize numbers into Chinese characters.
|
||||
https://zh.wikipedia.org/wiki/中文数字#現代中文
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from typing import List
|
||||
|
||||
DIGITS = {str(i): tran for i, tran in enumerate('零一二三四五六七八九')}
|
||||
UNITS = OrderedDict({
|
||||
1: '十',
|
||||
2: '百',
|
||||
3: '千',
|
||||
4: '万',
|
||||
8: '亿',
|
||||
})
|
||||
DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
|
||||
UNITS = OrderedDict(
|
||||
{
|
||||
1: "十",
|
||||
2: "百",
|
||||
3: "千",
|
||||
4: "万",
|
||||
8: "亿",
|
||||
}
|
||||
)
|
||||
|
||||
COM_QUANTIFIERS = '(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
|
||||
COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
|
||||
|
||||
# 分数表达式
|
||||
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
|
||||
RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
|
||||
|
||||
|
||||
def replace_frac(match) -> str:
|
||||
@@ -52,7 +55,7 @@ def replace_frac(match) -> str:
|
||||
|
||||
|
||||
# 百分数表达式
|
||||
RE_PERCENTAGE = re.compile(r'(-?)(\d+(\.\d+)?)%')
|
||||
RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
|
||||
|
||||
|
||||
def replace_percentage(match) -> str:
|
||||
@@ -72,7 +75,7 @@ def replace_percentage(match) -> str:
|
||||
|
||||
# 整数表达式
|
||||
# 带负号的整数 -10
|
||||
RE_INTEGER = re.compile(r'(-)' r'(\d+)')
|
||||
RE_INTEGER = re.compile(r"(-)" r"(\d+)")
|
||||
|
||||
|
||||
def replace_negative_num(match) -> str:
|
||||
@@ -92,7 +95,7 @@ def replace_negative_num(match) -> str:
|
||||
|
||||
# 编号-无符号整形
|
||||
# 00078
|
||||
RE_DEFAULT_NUM = re.compile(r'\d{3}\d*')
|
||||
RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
|
||||
|
||||
|
||||
def replace_default_num(match):
|
||||
@@ -110,15 +113,11 @@ def replace_default_num(match):
|
||||
# RE_ASMD = re.compile(
|
||||
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
|
||||
RE_ASMD = re.compile(
|
||||
r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))')
|
||||
r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
|
||||
)
|
||||
|
||||
asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}
|
||||
|
||||
asmd_map = {
|
||||
'+': '加',
|
||||
'-': '减',
|
||||
'×': '乘',
|
||||
'÷': '除',
|
||||
'=': '等于'
|
||||
}
|
||||
|
||||
def replace_asmd(match) -> str:
|
||||
"""
|
||||
@@ -132,24 +131,25 @@ def replace_asmd(match) -> str:
|
||||
|
||||
|
||||
# 次方专项
|
||||
RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')
|
||||
RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
|
||||
|
||||
power_map = {
|
||||
'⁰': '0',
|
||||
'¹': '1',
|
||||
'²': '2',
|
||||
'³': '3',
|
||||
'⁴': '4',
|
||||
'⁵': '5',
|
||||
'⁶': '6',
|
||||
'⁷': '7',
|
||||
'⁸': '8',
|
||||
'⁹': '9',
|
||||
'ˣ': 'x',
|
||||
'ʸ': 'y',
|
||||
'ⁿ': 'n'
|
||||
"⁰": "0",
|
||||
"¹": "1",
|
||||
"²": "2",
|
||||
"³": "3",
|
||||
"⁴": "4",
|
||||
"⁵": "5",
|
||||
"⁶": "6",
|
||||
"⁷": "7",
|
||||
"⁸": "8",
|
||||
"⁹": "9",
|
||||
"ˣ": "x",
|
||||
"ʸ": "y",
|
||||
"ⁿ": "n",
|
||||
}
|
||||
|
||||
|
||||
def replace_power(match) -> str:
|
||||
"""
|
||||
Args:
|
||||
@@ -166,10 +166,10 @@ def replace_power(match) -> str:

# Number expressions
# Pure decimals
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')
RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
# Positive integer + quantifier
RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))')
RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")


def replace_positive_quantifier(match) -> str:
@@ -220,7 +220,9 @@ RE_RANGE = re.compile(
    [-~]                   # match the range separator
    ((-?)((\d+)(\.\d+)?))  # end of the range: a negative or positive number (integer or decimal)
    (?![\d\+\-\×÷=])       # negative lookahead: no further digits or operators may follow the range
    """, re.VERBOSE)
    """,
    re.VERBOSE,
)


def replace_range(match) -> str:
@@ -239,7 +241,9 @@ def replace_range(match) -> str:
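# Cut-down illustration of the verbose tail shown above (the full RE_RANGE, including its opening
# part, is elided in this hunk; this demo pattern is hypothetical):
import re

RANGE_DEMO = re.compile(
    r"""((-?)((\d+)(\.\d+)?))   # start of the range
    [-~]                        # range separator
    ((-?)((\d+)(\.\d+)?))       # end of the range
    (?![\d\+\-\×÷=])            # no further digits or operators may follow
    """,
    re.VERBOSE,
)
print(bool(RANGE_DEMO.search("每天3-5次")))  # True
print(bool(RANGE_DEMO.search("3-5+2")))      # False -- the trailing "+" blocks the match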

# Ranges written with "~" (read as 至, "to")
RE_TO_RANGE = re.compile(
    r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)')
    r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
)


def replace_to_range(match) -> str:
    """
@@ -248,71 +252,66 @@ def replace_to_range(match) -> str:
    Returns:
        str
    """
    result = match.group(0).replace('~', '至')
    result = match.group(0).replace("~", "至")
    return result
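# Quick check (editor's illustration, not part of the commit), relying on the definitions just above;
# the "%" itself is verbalized later by the percentage rule.
print(RE_TO_RANGE.sub(replace_to_range, "湿度为30%~40%"))  # -> "湿度为30%至40%"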


def _get_value(value_string: str, use_zero: bool=True) -> List[str]:
    stripped = value_string.lstrip('0')
def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
    stripped = value_string.lstrip("0")
    if len(stripped) == 0:
        return []
    elif len(stripped) == 1:
        if use_zero and len(stripped) < len(value_string):
            return [DIGITS['0'], DIGITS[stripped]]
            return [DIGITS["0"], DIGITS[stripped]]
        else:
            return [DIGITS[stripped]]
    else:
        largest_unit = next(
            power for power in reversed(UNITS.keys()) if power < len(stripped))
        largest_unit = next(power for power in reversed(UNITS.keys()) if power < len(stripped))
        first_part = value_string[:-largest_unit]
        second_part = value_string[-largest_unit:]
        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(
            second_part)
        return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)


def verbalize_cardinal(value_string: str) -> str:
    if not value_string:
        return ''
        return ""

    # 000 -> '零' , 0 -> '零'
    value_string = value_string.lstrip('0')
    value_string = value_string.lstrip("0")
    if len(value_string) == 0:
        return DIGITS['0']
        return DIGITS["0"]

    result_symbols = _get_value(value_string)
    # verbalized number starting with '一十*' is abbreviated as `十*`
    if len(result_symbols) >= 2 and result_symbols[0] == DIGITS[
            '1'] and result_symbols[1] == UNITS[1]:
    if len(result_symbols) >= 2 and result_symbols[0] == DIGITS["1"] and result_symbols[1] == UNITS[1]:
        result_symbols = result_symbols[1:]
    return ''.join(result_symbols)
    return "".join(result_symbols)
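# Worked example (editor's illustration, not part of the commit). DIGITS and UNITS are defined earlier
# in this module (not shown in this diff); the expected outputs below assume the usual maps
# (0-9 -> 零一二三四五六七八九 and 1/2/3/4/8 -> 十/百/千/万/亿).
print(verbalize_cardinal("000"))   # -> "零"
print(verbalize_cardinal("12"))    # -> "十二"  (the leading 一 before 十 is dropped by the branch above)
print(verbalize_cardinal("1234"))  # -> "一千二百三十四"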


def verbalize_digit(value_string: str, alt_one=False) -> str:
    result_symbols = [DIGITS[digit] for digit in value_string]
    result = ''.join(result_symbols)
    result = "".join(result_symbols)
    if alt_one:
        result = result.replace("一", "幺")
    return result
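# Editor's illustration: alt_one swaps 一 for 幺, the reading used for digit strings such as phone numbers
# (outputs assume the usual DIGITS map, which is defined elsewhere in this module).
print(verbalize_digit("110"))                # -> "一一零"
print(verbalize_digit("110", alt_one=True))  # -> "幺幺零"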


def num2str(value_string: str) -> str:
    integer_decimal = value_string.split('.')
    integer_decimal = value_string.split(".")
    if len(integer_decimal) == 1:
        integer = integer_decimal[0]
        decimal = ''
        decimal = ""
    elif len(integer_decimal) == 2:
        integer, decimal = integer_decimal
    else:
        raise ValueError(
            f"The value string: '${value_string}' has more than one point in it."
        )
        raise ValueError(f"The value string: '${value_string}' has more than one point in it.")

    result = verbalize_cardinal(integer)

    decimal = decimal.rstrip('0')
    decimal = decimal.rstrip("0")
    if decimal:
        # '.22' is verbalized as '零点二二'
        # '3.20' is verbalized as '三点二'
        result = result if result else "零"
        result += '点' + verbalize_digit(decimal)
        result += "点" + verbalize_digit(decimal)
    return result
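# Editor's illustration, matching the comments above: trailing zeros in the decimal part are dropped and
# a bare decimal gets a leading 零 (integer readings assume the usual DIGITS/UNITS maps).
print(num2str("3.20"))  # -> "三点二"
print(num2str(".22"))   # -> "零点二二"
print(num2str("105"))   # -> "一百零五"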


@@ -21,10 +21,8 @@ from .num import verbalize_digit
# China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
# China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
# China Telecom: 133, 153, 189, 180, 181, 177
RE_MOBILE_PHONE = re.compile(
    r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(
    r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")
RE_MOBILE_PHONE = re.compile(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)")
RE_TELEPHONE = re.compile(r"(?<!\d)((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})(?!\d)")

# Nationwide unified service numbers start with 400
RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
@@ -32,14 +30,12 @@ RE_NATIONAL_UNIFORM_NUMBER = re.compile(r"(400)(-)?\d{3}(-)?\d{4}")
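# Hypothetical check of the mobile-number pattern (editor's illustration, not part of the commit):
import re

m = re.search(r"(?<!\d)((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})(?!\d)", "请拨打+86 13812345678")
print(m.group(0))  # "+86 13812345678"; the lookarounds keep the rule from firing inside longer digit runs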


def phone2str(phone_string: str, mobile=True) -> str:
    if mobile:
        sp_parts = phone_string.strip('+').split()
        result = ','.join(
            [verbalize_digit(part, alt_one=True) for part in sp_parts])
        sp_parts = phone_string.strip("+").split()
        result = ",".join([verbalize_digit(part, alt_one=True) for part in sp_parts])
        return result
    else:
        sil_parts = phone_string.split('-')
        result = ','.join(
            [verbalize_digit(part, alt_one=True) for part in sil_parts])
        sil_parts = phone_string.split("-")
        result = ",".join([verbalize_digit(part, alt_one=True) for part in sil_parts])
        return result
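# Editor's illustration of the two branches above (outputs assume the usual DIGITS map behind
# verbalize_digit, with 一 read as 幺):
print(phone2str("+86 13812345678"))             # -> "八六,幺三八幺二三四五六七八"
print(phone2str("400-123-4567", mobile=False))  # -> "四零零,幺二三,四五六七"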


@@ -17,7 +17,7 @@ from .num import num2str
# Temperature expressions; the temperature affects how the minus sign is read
# -3°C is read as 零下三度
RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
RE_TEMPERATURE = re.compile(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)")
measure_dict = {
    "cm2": "平方厘米",
    "cm²": "平方厘米",
@@ -35,7 +35,7 @@ measure_dict = {
    "ml": "毫升",
    "m": "米",
    "mm": "毫米",
    "s": "秒"
    "s": "秒",
}
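# Hypothetical illustration of the temperature groups; replace_temperature (elided here) reads the
# leading minus sign as 零下, per the comment above, and measure_dict supplies the unit readings.
import re

m = re.search(r"(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)", "今天-3°C")
print(m.group(1), m.group(2), m.group(4))  # "-", "3", "°C"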


@@ -56,9 +56,9 @@ from .quantifier import replace_measure
from .quantifier import replace_temperature


class TextNormalizer():
class TextNormalizer:
    def __init__(self):
        self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')
        self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)")

    def _split(self, text: str, lang="zh") -> List[str]:
        """Split long text into sentences with sentence-splitting punctuations.
@@ -71,66 +71,64 @@ class TextNormalizer():
        if lang == "zh":
            text = text.replace(" ", "")
            # Filter out special characters
            text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text)
            text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
            text = re.sub(r"[——《》【】<>{}()()#&@“”^_|\\]", "", text)
            text = self.SENTENCE_SPLITOR.sub(r"\1\n", text)
            text = text.strip()
            sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
            sentences = [sentence.strip() for sentence in re.split(r"\n+", text)]
        return sentences
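# Standalone re-run of the zh splitting steps above (editor's illustration, not part of the commit):
import re

splitor = re.compile(r"([:、,;。?!,;?!][”’]?)")
text = splitor.sub(r"\1\n", "今天天气不错!我们出去走走吧。").strip()
print([s.strip() for s in re.split(r"\n+", text)])  # ['今天天气不错!', '我们出去走走吧。']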

    def _post_replace(self, sentence: str) -> str:
        sentence = sentence.replace('/', '每')
        sentence = sentence.replace("/", "每")
        # sentence = sentence.replace('~', '至')
        sentence = sentence.replace('①', '一')
        sentence = sentence.replace('②', '二')
        sentence = sentence.replace('③', '三')
        sentence = sentence.replace('④', '四')
        sentence = sentence.replace('⑤', '五')
        sentence = sentence.replace('⑥', '六')
        sentence = sentence.replace('⑦', '七')
        sentence = sentence.replace('⑧', '八')
        sentence = sentence.replace('⑨', '九')
        sentence = sentence.replace('⑩', '十')
        sentence = sentence.replace('α', '阿尔法')
        sentence = sentence.replace('β', '贝塔')
        sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
        sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
        sentence = sentence.replace('ε', '艾普西龙')
        sentence = sentence.replace('ζ', '捷塔')
        sentence = sentence.replace('η', '依塔')
        sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
        sentence = sentence.replace('ι', '艾欧塔')
        sentence = sentence.replace('κ', '喀帕')
        sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
        sentence = sentence.replace('μ', '缪')
        sentence = sentence.replace('ν', '拗')
        sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
        sentence = sentence.replace('ο', '欧米克伦')
        sentence = sentence.replace('π', '派').replace('Π', '派')
        sentence = sentence.replace('ρ', '肉')
        sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
            'σ', '西格玛')
        sentence = sentence.replace('τ', '套')
        sentence = sentence.replace('υ', '宇普西龙')
        sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
        sentence = sentence.replace('χ', '器')
        sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
        sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
        sentence = sentence.replace("①", "一")
        sentence = sentence.replace("②", "二")
        sentence = sentence.replace("③", "三")
        sentence = sentence.replace("④", "四")
        sentence = sentence.replace("⑤", "五")
        sentence = sentence.replace("⑥", "六")
        sentence = sentence.replace("⑦", "七")
        sentence = sentence.replace("⑧", "八")
        sentence = sentence.replace("⑨", "九")
        sentence = sentence.replace("⑩", "十")
        sentence = sentence.replace("α", "阿尔法")
        sentence = sentence.replace("β", "贝塔")
        sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛")
        sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔")
        sentence = sentence.replace("ε", "艾普西龙")
        sentence = sentence.replace("ζ", "捷塔")
        sentence = sentence.replace("η", "依塔")
        sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔")
        sentence = sentence.replace("ι", "艾欧塔")
        sentence = sentence.replace("κ", "喀帕")
        sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达")
        sentence = sentence.replace("μ", "缪")
        sentence = sentence.replace("ν", "拗")
        sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西")
        sentence = sentence.replace("ο", "欧米克伦")
        sentence = sentence.replace("π", "派").replace("Π", "派")
        sentence = sentence.replace("ρ", "肉")
        sentence = sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛")
        sentence = sentence.replace("τ", "套")
        sentence = sentence.replace("υ", "宇普西龙")
        sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾")
        sentence = sentence.replace("χ", "器")
        sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛")
        sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽")
        # Fallback for math operators; also tolerates casual shorthand
        sentence = sentence.replace('+', '加')
        sentence = sentence.replace('-', '减')
        sentence = sentence.replace('×', '乘')
        sentence = sentence.replace('÷', '除')
        sentence = sentence.replace('=', '等')
        sentence = sentence.replace("+", "加")
        sentence = sentence.replace("-", "减")
        sentence = sentence.replace("×", "乘")
        sentence = sentence.replace("÷", "除")
        sentence = sentence.replace("=", "等")
        # regex filter for special characters; has one more character "-" than the filter at line 68
        sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence)
        sentence = re.sub(r"[-——《》【】<=>{}()()#&@“”^_|\\]", "", sentence)
        return sentence
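        # Hypothetical calls (editor's illustration), assuming TextNormalizer can be instantiated as-is:
        #     TextNormalizer()._post_replace("第①名的策略是π型")  # -> "第一名的策略是派型"
        #     TextNormalizer()._post_replace("3+5=8")              # -> "3加5等8"  (note 等, not 等于, in this fallback)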

    def normalize_sentence(self, sentence: str) -> str:
        # basic character conversions
        sentence = tranditional_to_simplified(sentence)
        sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
            F2H_DIGITS).translate(F2H_SPACE)
        sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE)

        # number related NSW verbalization
        sentence = RE_DATE.sub(replace_date, sentence)
@@ -161,8 +159,7 @@ class TextNormalizer():

        sentence = RE_INTEGER.sub(replace_negative_num, sentence)
        sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
                                               sentence)
        sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
        sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
        sentence = RE_NUMBER.sub(replace_number, sentence)
        sentence = self._post_replace(sentence)
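        # A minimal sketch of the full-width -> half-width idea behind F2H_ASCII_LETTERS / F2H_DIGITS /
        # F2H_SPACE used at the top of this method (the real translate tables are imported elsewhere in
        # this module and are not shown in this diff):
        #     F2H_DIGITS_DEMO = {ord("0") + i: ord("0") + i for i in range(10)}  # hypothetical table
        #     "GPT-SoVITS 2025".translate(F2H_DIGITS_DEMO)  # -> "GPT-SoVITS 2025"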