Add en_normalization and fix LangSegmenter (#2062)

This commit is contained in:
KamioRinn
2025-02-17 18:41:30 +08:00
committed by GitHub
parent c70daefea2
commit c17dd642c7
5 changed files with 301 additions and 41 deletions

View File

@@ -135,7 +135,7 @@ def cut3(inp):
@register_method("cut4")
def cut4(inp):
    """Split text into one segment per line, cutting at periods.

    Newline characters are stripped from both ends of ``inp``, then
    periods are stripped from both ends of the result. The remaining text
    is split on every "." that is neither immediately preceded nor
    immediately followed by a digit, so a decimal such as "3.14" is not
    cut in half. The periods used as split points are not kept.

    Segments made up only of characters found in the module-level
    ``punctuation`` collection (which includes empty segments) are
    dropped.

    Args:
        inp: The text to segment.

    Returns:
        The surviving segments joined by a single newline character.
    """
    inp = inp.strip("\n")
    # The lookbehind/lookahead keep a period that touches a digit on
    # either side from being treated as a split point.
    opts = re.split(r'(?<!\d)\.(?!\d)', inp.strip("."))
    # An empty segment yields an empty set, which is a subset of anything,
    # so blank pieces are filtered out together with punctuation-only ones.
    opts = [item for item in opts if not set(item).issubset(punctuation)]
    return "\n".join(opts)