{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "整体代码拆分" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tokenizer " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. 训练一个 tokenzier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "from tokenizers import ByteLevelBPETokenizer\n", "\n", "dir_path = \"\"\n", "paths = os.listdir(dir_path)\n", "\n", "# 使用字节级的 BPE 分词器\n", "tokenizer = ByteLevelBPETokenizer()\n", "\n", "# 进行训练\n", "# vocab_size:词表大小\n", "# min_frequency:最小词频\n", "# special_tokens:特殊 token 列表\n", "tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n", " \"\",\n", " \"\",\n", " \"\",\n", " \"\",\n", " \"\",\n", "])\n", "\n", "# 训练完成后手动保存\n", "tokenizer.save_model(\".\", \"esperberto\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 测试一下\n", "from tokenizers.implementations import ByteLevelBPETokenizer\n", "from tokenizers.processors import BertProcessing\n", "\n", "\n", "tokenizer = ByteLevelBPETokenizer(\n", " \"./models/EsperBERTo-small/vocab.json\",\n", " \"./models/EsperBERTo-small/merges.txt\",\n", ")\n", "tokenizer._tokenizer.post_processor = BertProcessing(\n", " (\"\", tokenizer.token_to_id(\"\")),\n", " (\"\", tokenizer.token_to_id(\"\")),\n", ")\n", "tokenizer.enable_truncation(max_length=512)\n", "\n", "print(\n", " tokenizer.encode(\"Mi estas Julien.\")\n", ")\n", "# Encoding(num_tokens=7, ...)\n", "# tokens: ['', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '']\n" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }