Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix

* ruff format --line-length 120 --target-version py39

* Change the link for G2PW Model

* update pytorch version and colab
This commit is contained in:
XXXXRT666
2025-04-07 09:42:47 +01:00
committed by GitHub
parent 9da7e17efe
commit 53cac93589
132 changed files with 8185 additions and 6648 deletions

View File

@@ -9,39 +9,43 @@ import importlib
import os
# Prevent failures to read the model on Windows
if os.name == 'nt':
if os.name == "nt":
class win_G2p(G2p):
def check_mecab(self):
super().check_mecab()
spam_spec = importlib.util.find_spec("eunjeon")
non_found = spam_spec is None
if non_found:
print(f'you have to install eunjeon. install it...')
print("you have to install eunjeon. install it...")
else:
installpath = spam_spec.submodule_search_locations[0]
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import sys
from eunjeon import Mecab as _Mecab
class Mecab(_Mecab):
def get_dicpath(installpath):
if not (re.match(r'^[A-Za-z0-9_/\\:.\-]*$', installpath)):
if not (re.match(r"^[A-Za-z0-9_/\\:.\-]*$", installpath)):
import shutil
python_dir = os.getcwd()
if (installpath[:len(python_dir)].upper() == python_dir.upper()):
dicpath = os.path.join(os.path.relpath(installpath,python_dir),'data','mecabrc')
else:
if not os.path.exists('TEMP'):
os.mkdir('TEMP')
if not os.path.exists(os.path.join('TEMP', 'ko')):
os.mkdir(os.path.join('TEMP', 'ko'))
if os.path.exists(os.path.join('TEMP', 'ko', 'ko_dict')):
shutil.rmtree(os.path.join('TEMP', 'ko', 'ko_dict'))
shutil.copytree(os.path.join(installpath, 'data'), os.path.join('TEMP', 'ko', 'ko_dict'))
dicpath = os.path.join('TEMP', 'ko', 'ko_dict', 'mecabrc')
python_dir = os.getcwd()
if installpath[: len(python_dir)].upper() == python_dir.upper():
dicpath = os.path.join(os.path.relpath(installpath, python_dir), "data", "mecabrc")
else:
if not os.path.exists("TEMP"):
os.mkdir("TEMP")
if not os.path.exists(os.path.join("TEMP", "ko")):
os.mkdir(os.path.join("TEMP", "ko"))
if os.path.exists(os.path.join("TEMP", "ko", "ko_dict")):
shutil.rmtree(os.path.join("TEMP", "ko", "ko_dict"))
shutil.copytree(
os.path.join(installpath, "data"), os.path.join("TEMP", "ko", "ko_dict")
)
dicpath = os.path.join("TEMP", "ko", "ko_dict", "mecabrc")
else:
dicpath=os.path.abspath(os.path.join(installpath, 'data/mecabrc'))
dicpath = os.path.abspath(os.path.join(installpath, "data/mecabrc"))
return dicpath
def __init__(self, dicpath=get_dicpath(installpath)):
@@ -52,97 +56,108 @@ if os.name == 'nt':
G2p = win_G2p
from text.symbols2 import symbols
from text.symbols2 import symbols
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
# Counter words stored as ONE space-separated string, not a list.
# number_to_hangul checks `classifier[:2] in _korean_classifiers` and
# `classifier[0] in _korean_classifiers`, so the test is a substring search
# over this whole string rather than an exact-word lookup.
_korean_classifiers = (
"군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통"
)
# List of (hangul, hangul divided) pairs:
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
# ('ㄳ', 'ㄱㅅ'), # g2pk2, A Syllable-ending Rule
# ('ㄵ', 'ㄴㅈ'),
# ('', 'ㄴㅎ'),
# ('', 'ㄹㄱ'),
# ('', 'ㄹㅁ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㄹ'),
# ('', 'ㅂㅅ'),
('', 'ㅗㅏ'),
('', 'ㅗㅐ'),
('', 'ㅗㅣ'),
('', 'ㅜㅓ'),
('', 'ㅜㅔ'),
('', 'ㅜㅣ'),
('', 'ㅡㅣ'),
('', 'ㅣㅏ'),
('', 'ㅣㅐ'),
('', 'ㅣㅓ'),
('', 'ㅣㅔ'),
('', 'ㅣㅗ'),
('', 'ㅣㅜ')
]]
# Ordered (compiled pattern, replacement) pairs. Each pattern is a single
# compound vowel jamo and each replacement is the two-jamo sequence it is
# split into. The consonant-cluster entries are kept commented out, as in the
# original (annotated there as "g2pk2, A Syllable-ending Rule").
#
# NOTE(review): in the extracted text every pattern key was the empty string.
# re.compile("") matches at every position, so the replacement would be
# inserted between all characters. The keys are restored here from the
# decompositions on the right-hand side - confirm against upstream.
_hangul_divided = [
    (re.compile(compound), parts)
    for compound, parts in [
        # ('ㄳ', 'ㄱㅅ'),  # g2pk2, A Syllable-ending Rule
        # ('ㄵ', 'ㄴㅈ'),
        # ('ㄶ', 'ㄴㅎ'),
        # ('ㄺ', 'ㄹㄱ'),
        # ('ㄻ', 'ㄹㅁ'),
        # ('ㄼ', 'ㄹㅂ'),
        # ('ㄽ', 'ㄹㅅ'),
        # ('ㄾ', 'ㄹㅌ'),
        # ('ㄿ', 'ㄹㅍ'),
        # ('ㅀ', 'ㄹㅎ'),
        # ('ㅄ', 'ㅂㅅ'),
        ("ㅘ", "ㅗㅏ"),
        ("ㅙ", "ㅗㅐ"),
        ("ㅚ", "ㅗㅣ"),
        ("ㅝ", "ㅜㅓ"),
        ("ㅞ", "ㅜㅔ"),
        ("ㅟ", "ㅜㅣ"),
        ("ㅢ", "ㅡㅣ"),
        ("ㅑ", "ㅣㅏ"),
        ("ㅒ", "ㅣㅐ"),
        ("ㅕ", "ㅣㅓ"),
        ("ㅖ", "ㅣㅔ"),
        ("ㅛ", "ㅣㅗ"),
        ("ㅠ", "ㅣㅜ"),
    ]
]
# List of (Latin alphabet, hangul) pairs:
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('a', '에이'),
('b', ''),
('c', ''),
('d', ''),
('e', ''),
('f', '에프'),
('g', ''),
('h', '에이치'),
('i', '아이'),
('j', '제이'),
('k', '케이'),
('l', ''),
('m', ''),
('n', ''),
('o', ''),
('p', ''),
('q', ''),
('r', '아르'),
('s', '에스'),
('t', ''),
('u', ''),
('v', '브이'),
('w', '더블유'),
('x', '엑스'),
('y', '와이'),
('z', '제트')
]]
# Ordered (case-insensitive pattern, Hangul spelling) pairs, one per Latin
# letter a-z. Presumably applied in sequence by latin_to_hangul, which is not
# visible in this excerpt.
#
# NOTE(review): thirteen spellings (b, c, d, e, g, l, m, n, o, p, q, t, u) were
# empty strings in the extracted text, which would silently delete those
# letters instead of transliterating them. They are restored here with the
# standard Korean names of the letters - confirm against upstream.
_latin_to_hangul = [
    (re.compile(letter, re.IGNORECASE), name)
    for letter, name in [
        ("a", "에이"),
        ("b", "비"),
        ("c", "시"),
        ("d", "디"),
        ("e", "이"),
        ("f", "에프"),
        ("g", "지"),
        ("h", "에이치"),
        ("i", "아이"),
        ("j", "제이"),
        ("k", "케이"),
        ("l", "엘"),
        ("m", "엠"),
        ("n", "엔"),
        ("o", "오"),
        ("p", "피"),
        ("q", "큐"),
        ("r", "아르"),
        ("s", "에스"),
        ("t", "티"),
        ("u", "유"),
        ("v", "브이"),
        ("w", "더블유"),
        ("x", "엑스"),
        ("y", "와이"),
        ("z", "제트"),
    ]
]
# List of (ipa, lazy ipa) pairs:
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('t͡ɕ','ʧ'),
('d͡ʑ','ʥ'),
('ɲ','n^'),
('ɕ','ʃ'),
('ʷ','w'),
('ɭ','l`'),
('ʎ','ɾ'),
('ɣ','ŋ'),
('ɰ','ɯ'),
('ʝ','j'),
('ʌ','ə'),
('ɡ','g'),
('\u031a','#'),
('\u0348','='),
('\u031e',''),
('\u0320',''),
('\u0339','')
]]
# Ordered (case-insensitive pattern, replacement) pairs that rewrite IPA
# output into the simplified "lazy IPA" symbol set. Order matters: the
# two-symbol affricates are listed before the bare "ɕ" rule.
_ipa_to_lazy_ipa = [
    (re.compile(ipa, re.IGNORECASE), lazy)
    for ipa, lazy in (
        # letter-level substitutions
        ("t͡ɕ", "ʧ"), ("d͡ʑ", "ʥ"), ("ɲ", "n^"), ("ɕ", "ʃ"),
        ("ʷ", "w"), ("ɭ", "l`"), ("ʎ", "ɾ"), ("ɣ", "ŋ"),
        ("ɰ", "ɯ"), ("ʝ", "j"), ("ʌ", "ə"), ("ɡ", "g"),
        # combining diacritics: two are rewritten, three are dropped
        ("\u031a", "#"), ("\u0348", "="),
        ("\u031e", ""), ("\u0320", ""), ("\u0339", ""),
    )
]
def fix_g2pk2_error(text):
new_text = ""
i = 0
while i < len(text) - 4:
if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == '':
new_text += text[i:i+3] + ' ' + ''
if (text[i : i + 3] == "ㅇㅡㄹ" or text[i : i + 3] == "ㄹㅡㄹ") and text[i + 3] == " " and text[i + 4] == "":
new_text += text[i : i + 3] + " " + ""
i += 5
else:
new_text += text[i]
@@ -166,20 +181,20 @@ def divide_hangul(text):
def hangul_number(num, sino=True):
'''Reference https://github.com/Kyubyong/g2pK'''
num = re.sub(',', '', num)
"""Reference https://github.com/Kyubyong/g2pK"""
num = re.sub(",", "", num)
if num == '0':
return ''
if not sino and num == '20':
return '스무'
if num == "0":
return ""
if not sino and num == "20":
return "스무"
digits = '123456789'
names = '일이삼사오육칠팔구'
digits = "123456789"
names = "일이삼사오육칠팔구"
digit2name = {d: n for d, n in zip(digits, names)}
modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
modifiers = "한 두 세 네 다섯 여섯 일곱 여덟 아홉"
decimals = "열 스물 서른 마흔 쉰 예순 일흔 여든 아흔"
digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
@@ -188,75 +203,75 @@ def hangul_number(num, sino=True):
i = len(num) - i - 1
if sino:
if i == 0:
name = digit2name.get(digit, '')
name = digit2name.get(digit, "")
elif i == 1:
name = digit2name.get(digit, '') + ''
name = name.replace('일십', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일십", "")
else:
if i == 0:
name = digit2mod.get(digit, '')
name = digit2mod.get(digit, "")
elif i == 1:
name = digit2dec.get(digit, '')
if digit == '0':
name = digit2dec.get(digit, "")
if digit == "0":
if i % 4 == 0:
last_three = spelledout[-min(3, len(spelledout)):]
if ''.join(last_three) == '':
spelledout.append('')
last_three = spelledout[-min(3, len(spelledout)) :]
if "".join(last_three) == "":
spelledout.append("")
continue
else:
spelledout.append('')
spelledout.append("")
continue
if i == 2:
name = digit2name.get(digit, '') + ''
name = name.replace('일백', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일백", "")
elif i == 3:
name = digit2name.get(digit, '') + ''
name = name.replace('일천', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일천", "")
elif i == 4:
name = digit2name.get(digit, '') + ''
name = name.replace('일만', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일만", "")
elif i == 5:
name = digit2name.get(digit, '') + ''
name = name.replace('일십', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일십", "")
elif i == 6:
name = digit2name.get(digit, '') + ''
name = name.replace('일백', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일백", "")
elif i == 7:
name = digit2name.get(digit, '') + ''
name = name.replace('일천', '')
name = digit2name.get(digit, "") + ""
name = name.replace("일천", "")
elif i == 8:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 9:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 10:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 11:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 12:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 13:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 14:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
elif i == 15:
name = digit2name.get(digit, '') + ''
name = digit2name.get(digit, "") + ""
spelledout.append(name)
return ''.join(elem for elem in spelledout)
return "".join(elem for elem in spelledout)
def number_to_hangul(text):
# Spell out the digits in `text` as Hangul and return the new string.
# Step 1: every run of digits/commas that is immediately followed by Hangul
# syllables (U+AC00-U+D71F) is captured together with that following word.
# Step 2: any digit still left afterwards is replaced one character at a time.
'''Reference https://github.com/Kyubyong/g2pK'''
tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
"""Reference https://github.com/Kyubyong/g2pK"""
tokens = set(re.findall(r"(\d[\d,]*)([\uac00-\ud71f]+)", text))
for token in tokens:
num, classifier = token
# `in` on _korean_classifiers is a substring test on one long string, so the
# first one or two characters of the following word only need to occur
# somewhere in it for the sino=False reading to be chosen.
if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
spelledout = hangul_number(num, sino=False)
else:
spelledout = hangul_number(num, sino=True)
# str.replace rewrites every occurrence of this exact number+word pair.
text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
text = text.replace(f"{num}{classifier}", f"{spelledout}{classifier}")
# digit by digit for remaining digits
digits = '0123456789'
names = '영일이삼사오육칠팔구'
digits = "0123456789"
names = "영일이삼사오육칠팔구"
for d, n in zip(digits, names):
text = text.replace(d, n)
return text
@@ -265,19 +280,23 @@ def number_to_hangul(text):
def korean_to_lazy_ipa(text):
# Convert `text` to the simplified "lazy IPA" form and return it.
# Latin letters and digits are first rewritten as Hangul; each run of Hangul
# syllables (U+AC00-U+D7AF) is then romanised by ko_pron in "ipa" mode, and the
# _ipa_to_lazy_ipa substitutions are applied in list order.
text = latin_to_hangul(text)
text = number_to_hangul(text)
# Only the part of ko_pron's output before the first "] ~ [" separator is kept.
# NOTE(review): presumably that separator divides alternative readings - confirm.
text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
text = re.sub("[\uac00-\ud7af]+", lambda x: ko_pron.romanise(x.group(0), "ipa").split("] ~ [")[0], text)
for regex, replacement in _ipa_to_lazy_ipa:
text = re.sub(regex, replacement, text)
return text
_g2p=G2p()
_g2p = G2p()
def korean_to_ipa(text):
# Convert `text` to IPA: Hangul-ise Latin letters and digits, run the module
# level G2p instance (_g2p), patch its output with fix_g2pk2_error, then reuse
# korean_to_lazy_ipa and post-process the "ʧ" / "ʥ" symbols it produces.
# Note that korean_to_lazy_ipa calls latin_to_hangul and number_to_hangul
# again on the already converted text.
text = latin_to_hangul(text)
text = number_to_hangul(text)
text = _g2p(text)
text = fix_g2pk2_error(text)
text = korean_to_lazy_ipa(text)
# NOTE(review): as shown here both replacement strings are empty, so the two
# symbols are simply removed; check whether characters were lost from these
# literals before relying on that.
return text.replace('ʧ','').replace('ʥ','')
return text.replace("ʧ", "").replace("ʥ", "")
def post_replace_ph(ph):
rep_map = {
@@ -301,12 +320,13 @@ def post_replace_ph(ph):
ph = ""
return ph
def g2p(text):
# Turn `text` into a list of phoneme symbols.
# Pipeline: Latin letters -> Hangul, module-level G2p instance (_g2p),
# divide_hangul, fix_g2pk2_error, then one post_replace_ph call per character
# of the resulting string. The return value is a list, not a string.
text = latin_to_hangul(text)
text = _g2p(text)
text = divide_hangul(text)
text = fix_g2pk2_error(text)
# If the text ends with a character in U+3131-U+3163, append a "." after it.
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
text = re.sub(r"([\u3131-\u3163])$", r"\1.", text)
# text = "".join([post_replace_ph(i) for i in text])
text = [post_replace_ph(i) for i in text]
return text
@@ -314,4 +334,4 @@ def g2p(text):
if __name__ == "__main__":
text = "안녕하세요"
print(g2p(text))
print(g2p(text))