{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"整体代码拆分"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tokenizer "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. 训练一个 tokenzier"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from tokenizers import ByteLevelBPETokenizer\n",
"\n",
"dir_path = \"\"\n",
"paths = os.listdir(dir_path)\n",
"\n",
"# 使用字节级的 BPE 分词器\n",
"tokenizer = ByteLevelBPETokenizer()\n",
"\n",
"# 进行训练\n",
"# vocab_size:词表大小\n",
"# min_frequency:最小词频\n",
"# special_tokens:特殊 token 列表\n",
"tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n",
" \"\",\n",
" \"\",\n",
" \"\",\n",
" \"\",\n",
" \"\",\n",
"])\n",
"\n",
"# 训练完成后手动保存\n",
"tokenizer.save_model(\".\", \"esperberto\")\n"
]
},
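{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional: the saved `vocab.json` / `merges.txt` pair can also be wrapped in a `transformers` fast tokenizer. The sketch below is only an example under assumptions: it presumes the `transformers` library is installed and that the cell above produced `./esperberto-vocab.json` and `./esperberto-merges.txt` (the file names `save_model(\".\", \"esperberto\")` writes)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch, assuming the transformers library is available\n",
"from transformers import RobertaTokenizerFast\n",
"\n",
"hf_tokenizer = RobertaTokenizerFast(\n",
"    vocab_file=\"./esperberto-vocab.json\",   # written by save_model above\n",
"    merges_file=\"./esperberto-merges.txt\",\n",
"    model_max_length=512,\n",
")\n",
"print(hf_tokenizer(\"Mi estas Julien.\").input_ids)\n"
]
},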
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 测试一下\n",
"from tokenizers.implementations import ByteLevelBPETokenizer\n",
"from tokenizers.processors import BertProcessing\n",
"\n",
"\n",
"tokenizer = ByteLevelBPETokenizer(\n",
" \"./models/EsperBERTo-small/vocab.json\",\n",
" \"./models/EsperBERTo-small/merges.txt\",\n",
")\n",
"tokenizer._tokenizer.post_processor = BertProcessing(\n",
" (\"\", tokenizer.token_to_id(\"\")),\n",
" (\"\", tokenizer.token_to_id(\"\")),\n",
")\n",
"tokenizer.enable_truncation(max_length=512)\n",
"\n",
"print(\n",
" tokenizer.encode(\"Mi estas Julien.\")\n",
")\n",
"# Encoding(num_tokens=7, ...)\n",
"# tokens: ['', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '']\n"
]
}
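{
"cell_type": "markdown",
"metadata": {},
"source": [
"The encoding can also be round-tripped. A small sketch using the tokenizer configured above: `decode` maps the ids back to text, and `skip_special_tokens=True` drops the wrapping `<s>` / `</s>`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Round-trip check: text -> ids -> text\n",
"encoding = tokenizer.encode(\"Mi estas Julien.\")\n",
"print(encoding.ids)\n",
"print(tokenizer.decode(encoding.ids, skip_special_tokens=True))\n",
"# Expected: 'Mi estas Julien.'\n"
]
}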
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}