Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)
* ruff check --fix
* ruff format --line-length 120 --target-version py39
* Change the link for G2PW Model
* update pytorch version and colab
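A note on the most common change below: `ruff format` rewrites single-quoted strings to double quotes and, per the PEP 8 slice rule (treat the colon as a binary operator when a bound is a non-trivial expression), spaces out slice bounds such as text[len(space_str):]. Both spellings are the same expression; a minimal check:

# Cosmetic only: the spaced slice bound does not change behavior.
s = "  hello"
assert s[len("  "):] == s[len("  ") :] == "hello"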
@@ -15,6 +15,7 @@
 Credits
     This code is modified from https://github.com/GitYCC/g2pW
 """
+
 import os
 import re
 
@@ -24,14 +25,14 @@ def wordize_and_map(text: str):
     index_map_from_text_to_word = []
     index_map_from_word_to_text = []
     while len(text) > 0:
-        match_space = re.match(r'^ +', text)
+        match_space = re.match(r"^ +", text)
         if match_space:
             space_str = match_space.group(0)
             index_map_from_text_to_word += [None] * len(space_str)
-            text = text[len(space_str):]
+            text = text[len(space_str) :]
             continue
 
-        match_en = re.match(r'^[a-zA-Z0-9]+', text)
+        match_en = re.match(r"^[a-zA-Z0-9]+", text)
         if match_en:
             en_word = match_en.group(0)
 
@@ -42,7 +43,7 @@ def wordize_and_map(text: str):
             index_map_from_text_to_word += [len(words)] * len(en_word)
 
             words.append(en_word)
-            text = text[len(en_word):]
+            text = text[len(en_word) :]
         else:
             word_start_pos = len(index_map_from_text_to_word)
             word_end_pos = word_start_pos + 1
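For orientation, here is a sketch of what wordize_and_map appears to return, inferred from the branches visible above (the return statement falls outside this diff, so the output order is an assumption): runs of ASCII letters and digits form one word, spaces map to None, and each remaining character becomes its own word.

# Assumed output: (words, index_map_from_text_to_word, index_map_from_word_to_text)
words, text2word, word2text = wordize_and_map("abc 你好")
# words     -> ["abc", "你", "好"]
# text2word -> [0, 0, 0, None, 1, 2]      # character index -> word index (None for spaces)
# word2text -> [(0, 3), (4, 5), (5, 6)]   # word index -> character span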
@@ -63,15 +64,14 @@ def tokenize_and_map(tokenizer, text: str):
     for word, (word_start, word_end) in zip(words, word2text):
         word_tokens = tokenizer.tokenize(word)
 
-        if len(word_tokens) == 0 or word_tokens == ['[UNK]']:
+        if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
             index_map_from_token_to_text.append((word_start, word_end))
-            tokens.append('[UNK]')
+            tokens.append("[UNK]")
         else:
             current_word_start = word_start
             for word_token in word_tokens:
-                word_token_len = len(re.sub(r'^##', '', word_token))
-                index_map_from_token_to_text.append(
-                    (current_word_start, current_word_start + word_token_len))
+                word_token_len = len(re.sub(r"^##", "", word_token))
+                index_map_from_token_to_text.append((current_word_start, current_word_start + word_token_len))
                 current_word_start = current_word_start + word_token_len
                 tokens.append(word_token)
 
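The re.sub(r"^##", "", word_token) above strips the BERT wordpiece continuation marker, so only source characters are counted when advancing current_word_start. A quick check of that length logic:

import re

# "##ing" is a continuation piece covering 3 source characters;
# a leading sub-token such as "play" carries no marker.
assert len(re.sub(r"^##", "", "##ing")) == 3
assert len(re.sub(r"^##", "", "play")) == 4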
@@ -85,53 +85,51 @@ def tokenize_and_map(tokenizer, text: str):
 
 def _load_config(config_path: os.PathLike):
     import importlib.util
 
-    spec = importlib.util.spec_from_file_location('__init__', config_path)
+    spec = importlib.util.spec_from_file_location("__init__", config_path)
     config = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(config)
     return config
 
 
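_load_config executes an arbitrary Python file as a throwaway module, so every top-level name in that file becomes an attribute of the returned object. A hypothetical usage sketch (my_config.py is an illustrative filename, not part of this repo):

# Given a file my_config.py containing the line `window_size = 16`:
cfg = _load_config("my_config.py")
print(cfg.window_size)  # -> 16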
 default_config_dict = {
-    'manual_seed': 1313,
-    'model_source': 'bert-base-chinese',
-    'window_size': 32,
-    'num_workers': 2,
-    'use_mask': True,
-    'use_char_phoneme': False,
-    'use_conditional': True,
-    'param_conditional': {
-        'affect_location': 'softmax',
-        'bias': True,
-        'char-linear': True,
-        'pos-linear': False,
-        'char+pos-second': True,
-        'char+pos-second_lowrank': False,
-        'lowrank_size': 0,
-        'char+pos-second_fm': False,
-        'fm_size': 0,
-        'fix_mode': None,
-        'count_json': 'train.count.json'
+    "manual_seed": 1313,
+    "model_source": "bert-base-chinese",
+    "window_size": 32,
+    "num_workers": 2,
+    "use_mask": True,
+    "use_char_phoneme": False,
+    "use_conditional": True,
+    "param_conditional": {
+        "affect_location": "softmax",
+        "bias": True,
+        "char-linear": True,
+        "pos-linear": False,
+        "char+pos-second": True,
+        "char+pos-second_lowrank": False,
+        "lowrank_size": 0,
+        "char+pos-second_fm": False,
+        "fm_size": 0,
+        "fix_mode": None,
+        "count_json": "train.count.json",
     },
-    'lr': 5e-5,
-    'val_interval': 200,
-    'num_iter': 10000,
-    'use_focal': False,
-    'param_focal': {
-        'alpha': 0.0,
-        'gamma': 0.7
-    },
-    'use_pos': True,
-    'param_pos ': {
-        'weight': 0.1,
-        'pos_joint_training': True,
-        'train_pos_path': 'train.pos',
-        'valid_pos_path': 'dev.pos',
-        'test_pos_path': 'test.pos'
-    }
+    "lr": 5e-5,
+    "val_interval": 200,
+    "num_iter": 10000,
+    "use_focal": False,
+    "param_focal": {"alpha": 0.0, "gamma": 0.7},
+    "use_pos": True,
+    "param_pos ": {
+        "weight": 0.1,
+        "pos_joint_training": True,
+        "train_pos_path": "train.pos",
+        "valid_pos_path": "dev.pos",
+        "test_pos_path": "test.pos",
+    },
 }
 
 
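Note that the "param_pos " key keeps its trailing space on both sides of the diff: the formatter rewrites quoting and layout but never string contents, so any lookup must match the key exactly.

# The trailing space is part of the key and survives reformatting.
default_config_dict["param_pos "]["weight"]  # -> 0.1
default_config_dict.get("param_pos")         # -> None (no such key)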
-def load_config(config_path: os.PathLike, use_default: bool=False):
+def load_config(config_path: os.PathLike, use_default: bool = False):
     config = _load_config(config_path)
     if use_default:
         for attr, val in default_config_dict.items():
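The hunk ends mid-loop; presumably the elided body copies each default onto the loaded module when the user's config omits it. A hedged usage sketch under that assumption (my_config.py is again illustrative):

# Assuming the elided loop back-fills missing attributes from default_config_dict:
config = load_config("my_config.py", use_default=True)
print(config.manual_seed)  # -> 1313 when my_config.py does not set it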