Better normlization (#1351)

This commit is contained in:
KamioRinn
2024-07-27 16:03:43 +08:00
committed by GitHub
parent f042030cca
commit e851ae34c9
6 changed files with 93 additions and 18 deletions

View File

@@ -107,8 +107,11 @@ def replace_default_num(match):
# 加减乘除
# RE_ASMD = re.compile(
# r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
RE_ASMD = re.compile(
r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
r'((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))')
asmd_map = {
'+': '',
'-': '',
@@ -117,7 +120,6 @@ asmd_map = {
'=': '等于'
}
def replace_asmd(match) -> str:
"""
Args:
@@ -129,6 +131,39 @@ def replace_asmd(match) -> str:
return result
# 次方专项
RE_POWER = re.compile(r'[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+')
power_map = {
'': '0',
'¹': '1',
'²': '2',
'³': '3',
'': '4',
'': '5',
'': '6',
'': '7',
'': '8',
'': '9',
'ˣ': 'x',
'ʸ': 'y',
'': 'n'
}
def replace_power(match) -> str:
"""
Args:
match (re.Match)
Returns:
str
"""
power_num = ""
for m in match.group(0):
power_num += power_map[m]
result = "" + power_num + "次方"
return result
# 数字表达式
# 纯小数
RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))')

View File

@@ -35,6 +35,7 @@ from .num import RE_POSITIVE_QUANTIFIERS
from .num import RE_RANGE
from .num import RE_TO_RANGE
from .num import RE_ASMD
from .num import RE_POWER
from .num import replace_default_num
from .num import replace_frac
from .num import replace_negative_num
@@ -44,6 +45,7 @@ from .num import replace_positive_quantifier
from .num import replace_range
from .num import replace_to_range
from .num import replace_asmd
from .num import replace_power
from .phonecode import RE_MOBILE_PHONE
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
from .phonecode import RE_TELEPHONE
@@ -114,6 +116,12 @@ class TextNormalizer():
sentence = sentence.replace('χ', '')
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
# 兜底数学运算,顺便兼容懒人用语
sentence = sentence.replace('+', '')
sentence = sentence.replace('-', '')
sentence = sentence.replace('×', '')
sentence = sentence.replace('÷', '')
sentence = sentence.replace('=', '')
# re filter special characters, have one more character "-" than line 68
sentence = re.sub(r'[-——《》【】<=>{}()#&@“”^_|\\]', '', sentence)
return sentence
@@ -136,6 +144,12 @@ class TextNormalizer():
sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
sentence = replace_measure(sentence)
# 处理数学运算
while RE_ASMD.search(sentence):
sentence = RE_ASMD.sub(replace_asmd, sentence)
sentence = RE_POWER.sub(replace_power, sentence)
sentence = RE_FRAC.sub(replace_frac, sentence)
sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
@@ -145,10 +159,6 @@ class TextNormalizer():
sentence = RE_RANGE.sub(replace_range, sentence)
# 处理加减乘除
while RE_ASMD.search(sentence):
sentence = RE_ASMD.sub(replace_asmd, sentence)
sentence = RE_INTEGER.sub(replace_negative_num, sentence)
sentence = RE_DECIMAL_NUM.sub(replace_number, sentence)
sentence = RE_POSITIVE_QUANTIFIERS.sub(replace_positive_quantifier,