Files
happy-llm/docs/chapter6/code/whole.ipynb
Logan Zou ec7d0ef487 init ch6
2025-04-10 17:54:58 +08:00

92 lines
2.1 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"整体代码拆分"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tokenizer "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. 训练一个 tokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from tokenizers import ByteLevelBPETokenizer\n",
"\n",
"# Directory holding the plain-text training files. This is a placeholder:\n",
"# set it before running, since os.listdir raises FileNotFoundError on an empty path.\n",
"dir_path = \"\"\n",
"\n",
"# os.listdir returns bare file names, so join each one with dir_path to get a\n",
"# path that resolves from any working directory; sub-directories are skipped.\n",
"paths = sorted(\n",
"    os.path.join(dir_path, name)\n",
"    for name in os.listdir(dir_path)\n",
"    if os.path.isfile(os.path.join(dir_path, name))\n",
")\n",
"\n",
"# Use a byte-level BPE tokenizer\n",
"tokenizer = ByteLevelBPETokenizer()\n",
"\n",
"# Train the tokenizer\n",
"# vocab_size: size of the vocabulary\n",
"# min_frequency: minimum frequency threshold passed to the trainer\n",
"# special_tokens: list of special tokens\n",
"tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n",
"    \"<s>\",\n",
"    \"<pad>\",\n",
"    \"</s>\",\n",
"    \"<unk>\",\n",
"    \"<mask>\",\n",
"])\n",
"\n",
"# Training does not persist anything, so save the model files explicitly.\n",
"# Save without a file-name prefix into the directory that the test cell below\n",
"# loads from (vocab.json and merges.txt); create the directory first.\n",
"save_dir = \"./models/EsperBERTo-small\"\n",
"os.makedirs(save_dir, exist_ok=True)\n",
"tokenizer.save_model(save_dir)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick check: reload the saved tokenizer files and encode a sample sentence\n",
"from tokenizers.implementations import ByteLevelBPETokenizer\n",
"from tokenizers.processors import BertProcessing\n",
"\n",
"# Build the tokenizer from the vocabulary and merges files on disk\n",
"tokenizer = ByteLevelBPETokenizer(\n",
"    \"./models/EsperBERTo-small/vocab.json\",\n",
"    \"./models/EsperBERTo-small/merges.txt\",\n",
")\n",
"\n",
"# Post-process encodings so each sequence is wrapped as <s> ... </s>\n",
"# (the end token pair is passed first, the start token pair second)\n",
"end_pair = (\"</s>\", tokenizer.token_to_id(\"</s>\"))\n",
"start_pair = (\"<s>\", tokenizer.token_to_id(\"<s>\"))\n",
"tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)\n",
"\n",
"# Truncate encoded sequences to at most 512 tokens\n",
"tokenizer.enable_truncation(max_length=512)\n",
"\n",
"encoding = tokenizer.encode(\"Mi estas Julien.\")\n",
"print(encoding)\n",
"# Encoding(num_tokens=7, ...)\n",
"# tokens: ['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']\n"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}