Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)
* ruff check --fix
* ruff format --line-length 120 --target-version py39
* Change the link for the G2PW model
* Update PyTorch version and Colab
This commit is contained in:
@@ -56,9 +56,9 @@ from .quantifier import replace_measure
|
||||
from .quantifier import replace_temperature
|
||||
|
||||
|
||||
class TextNormalizer():
|
||||
class TextNormalizer:
|
||||
def __init__(self):
|
||||
self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)')
|
||||
self.SENTENCE_SPLITOR = re.compile(r"([:、,;。?!,;?!][”’]?)")
|
||||
|
||||
def _split(self, text: str, lang="zh") -> List[str]:
|
||||
"""Split long text into sentences with sentence-splitting punctuations.
|
||||
@@ -71,66 +71,64 @@ class TextNormalizer():
|
||||
if lang == "zh":
|
||||
text = text.replace(" ", "")
|
||||
# 过滤掉特殊字符
|
||||
text = re.sub(r'[——《》【】<>{}()()#&@“”^_|\\]', '', text)
|
||||
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
||||
text = re.sub(r"[——《》【】<>{}()()#&@“”^_|\\]", "", text)
|
||||
text = self.SENTENCE_SPLITOR.sub(r"\1\n", text)
|
||||
text = text.strip()
|
||||
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
||||
sentences = [sentence.strip() for sentence in re.split(r"\n+", text)]
|
||||
return sentences
|
||||
|
||||
def _post_replace(self, sentence: str) -> str:
|
||||
sentence = sentence.replace('/', '每')
|
||||
sentence = sentence.replace("/", "每")
|
||||
# sentence = sentence.replace('~', '至')
|
||||
# sentence = sentence.replace('~', '至')
|
||||
sentence = sentence.replace('①', '一')
|
||||
sentence = sentence.replace('②', '二')
|
||||
sentence = sentence.replace('③', '三')
|
||||
sentence = sentence.replace('④', '四')
|
||||
sentence = sentence.replace('⑤', '五')
|
||||
sentence = sentence.replace('⑥', '六')
|
||||
sentence = sentence.replace('⑦', '七')
|
||||
sentence = sentence.replace('⑧', '八')
|
||||
sentence = sentence.replace('⑨', '九')
|
||||
sentence = sentence.replace('⑩', '十')
|
||||
sentence = sentence.replace('α', '阿尔法')
|
||||
sentence = sentence.replace('β', '贝塔')
|
||||
sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
|
||||
sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
|
||||
sentence = sentence.replace('ε', '艾普西龙')
|
||||
sentence = sentence.replace('ζ', '捷塔')
|
||||
sentence = sentence.replace('η', '依塔')
|
||||
sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
|
||||
sentence = sentence.replace('ι', '艾欧塔')
|
||||
sentence = sentence.replace('κ', '喀帕')
|
||||
sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
|
||||
sentence = sentence.replace('μ', '缪')
|
||||
sentence = sentence.replace('ν', '拗')
|
||||
sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
|
||||
sentence = sentence.replace('ο', '欧米克伦')
|
||||
sentence = sentence.replace('π', '派').replace('Π', '派')
|
||||
sentence = sentence.replace('ρ', '肉')
|
||||
sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
|
||||
'σ', '西格玛')
|
||||
sentence = sentence.replace('τ', '套')
|
||||
sentence = sentence.replace('υ', '宇普西龙')
|
||||
sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
|
||||
sentence = sentence.replace('χ', '器')
|
||||
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
|
||||
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
|
||||
sentence = sentence.replace("①", "一")
|
||||
sentence = sentence.replace("②", "二")
|
||||
sentence = sentence.replace("③", "三")
|
||||
sentence = sentence.replace("④", "四")
|
||||
sentence = sentence.replace("⑤", "五")
|
||||
sentence = sentence.replace("⑥", "六")
|
||||
sentence = sentence.replace("⑦", "七")
|
||||
sentence = sentence.replace("⑧", "八")
|
||||
sentence = sentence.replace("⑨", "九")
|
||||
sentence = sentence.replace("⑩", "十")
|
||||
sentence = sentence.replace("α", "阿尔法")
|
||||
sentence = sentence.replace("β", "贝塔")
|
||||
sentence = sentence.replace("γ", "伽玛").replace("Γ", "伽玛")
|
||||
sentence = sentence.replace("δ", "德尔塔").replace("Δ", "德尔塔")
|
||||
sentence = sentence.replace("ε", "艾普西龙")
|
||||
sentence = sentence.replace("ζ", "捷塔")
|
||||
sentence = sentence.replace("η", "依塔")
|
||||
sentence = sentence.replace("θ", "西塔").replace("Θ", "西塔")
|
||||
sentence = sentence.replace("ι", "艾欧塔")
|
||||
sentence = sentence.replace("κ", "喀帕")
|
||||
sentence = sentence.replace("λ", "拉姆达").replace("Λ", "拉姆达")
|
||||
sentence = sentence.replace("μ", "缪")
|
||||
sentence = sentence.replace("ν", "拗")
|
||||
sentence = sentence.replace("ξ", "克西").replace("Ξ", "克西")
|
||||
sentence = sentence.replace("ο", "欧米克伦")
|
||||
sentence = sentence.replace("π", "派").replace("Π", "派")
|
||||
sentence = sentence.replace("ρ", "肉")
|
||||
sentence = sentence.replace("ς", "西格玛").replace("Σ", "西格玛").replace("σ", "西格玛")
|
||||
sentence = sentence.replace("τ", "套")
|
||||
sentence = sentence.replace("υ", "宇普西龙")
|
||||
sentence = sentence.replace("φ", "服艾").replace("Φ", "服艾")
|
||||
sentence = sentence.replace("χ", "器")
|
||||
sentence = sentence.replace("ψ", "普赛").replace("Ψ", "普赛")
|
||||
sentence = sentence.replace("ω", "欧米伽").replace("Ω", "欧米伽")
|
||||
# 兜底数学运算,顺便兼容懒人用语
|
||||
sentence = sentence.replace('+', '加')
|
||||
sentence = sentence.replace('-', '减')
|
||||
sentence = sentence.replace('×', '乘')
|
||||
sentence = sentence.replace('÷', '除')
|
||||
sentence = sentence.replace('=', '等')
|
||||
sentence = sentence.replace("+", "加")
|
||||
sentence = sentence.replace("-", "减")
|
||||
sentence = sentence.replace("×", "乘")
|
||||
sentence = sentence.replace("÷", "除")
|
||||
sentence = sentence.replace("=", "等")
|
||||
# Filter special characters again; this pattern strips "-" and "=" in addition to the characters removed by the pattern in _split
|
||||
sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|\\]', '', sentence)
|
||||
sentence = re.sub(r"[-——《》【】<=>{}()()#&@“”^_|\\]", "", sentence)
|
||||
return sentence
|
||||
|
||||
def normalize_sentence(self, sentence: str) -> str:
|
||||
# basic character conversions
|
||||
sentence = tranditional_to_simplified(sentence)
|
||||
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(
|
||||
F2H_DIGITS).translate(F2H_SPACE)
|
||||
sentence = sentence.translate(F2H_ASCII_LETTERS).translate(F2H_DIGITS).translate(F2H_SPACE)
|
||||
|
||||
# number related NSW verbalization
|
||||
sentence = RE_DATE.sub(replace_date, sentence)
|
||||
@@ -161,8 +159,7 @@ class TextNormalizer():
|
||||
|
||||
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
|
||||
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
|
||||
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,
|
||||
sentence)
|
||||
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier, sentence)
|
||||
sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence)
|
||||
sentence = RE_NUMBER.sub(replace_number, sentence)
|
||||
sentence = self._post_replace(sentence)
|
||||
|
||||
Reference in New Issue
Block a user