This commit is contained in:
Logan Zou
2025-04-10 17:54:58 +08:00
parent 945af52eff
commit ec7d0ef487
3 changed files with 713 additions and 0 deletions

View File

@@ -0,0 +1,91 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"整体代码拆分"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tokenizer "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. 训练一个 tokenzier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from tokenizers import ByteLevelBPETokenizer\n",
"\n",
"dir_path = \"\"\n",
"paths = os.listdir(dir_path)\n",
"\n",
"# 使用字节级的 BPE 分词器\n",
"tokenizer = ByteLevelBPETokenizer()\n",
"\n",
"# 进行训练\n",
"# vocab_size词表大小\n",
"# min_frequency最小词频\n",
"# special_tokens特殊 token 列表\n",
"tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n",
" \"<s>\",\n",
" \"<pad>\",\n",
" \"</s>\",\n",
" \"<unk>\",\n",
" \"<mask>\",\n",
"])\n",
"\n",
"# 训练完成后手动保存\n",
"tokenizer.save_model(\".\", \"esperberto\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 测试一下\n",
"from tokenizers.implementations import ByteLevelBPETokenizer\n",
"from tokenizers.processors import BertProcessing\n",
"\n",
"\n",
"tokenizer = ByteLevelBPETokenizer(\n",
" \"./models/EsperBERTo-small/vocab.json\",\n",
" \"./models/EsperBERTo-small/merges.txt\",\n",
")\n",
"tokenizer._tokenizer.post_processor = BertProcessing(\n",
" (\"</s>\", tokenizer.token_to_id(\"</s>\")),\n",
" (\"<s>\", tokenizer.token_to_id(\"<s>\")),\n",
")\n",
"tokenizer.enable_truncation(max_length=512)\n",
"\n",
"print(\n",
" tokenizer.encode(\"Mi estas Julien.\")\n",
")\n",
"# Encoding(num_tokens=7, ...)\n",
"# tokens: ['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']\n"
]
}
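,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is a minimal sketch that was not in the original notebook: assuming the `esperberto-vocab.json` / `esperberto-merges.txt` files written by `save_model(\".\", \"esperberto\")` above, it shows one way to wrap the trained BPE files in a `transformers` fast tokenizer for downstream model training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: load the trained BPE files into a transformers tokenizer\n",
"# (paths assume the save_model call earlier in this notebook)\n",
"from transformers import RobertaTokenizerFast\n",
"\n",
"hf_tokenizer = RobertaTokenizerFast(\n",
"    vocab_file=\"./esperberto-vocab.json\",\n",
"    merges_file=\"./esperberto-merges.txt\",\n",
"    model_max_length=512,\n",
")\n",
"\n",
"# Produces input_ids and attention_mask ready for model training\n",
"print(hf_tokenizer(\"Mi estas Julien.\"))\n"
]
}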
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}