update ch05

```bash
pip install tokenizers datasets transformers
```

Then, import the required libraries.

```python
import os
import json
import random
from typing import Generator

from datasets import load_dataset
from tokenizers import (
    decoders,
    models,
    pre_tokenizers,
    trainers,
    Tokenizer,
)
from tokenizers.normalizers import NFKC
from transformers import AutoTokenizer, PreTrainedTokenizerFast
```
### Step 2: Load the Training Data

```python
path_list = ['text_data1.txt', 'text_data2.txt', 'text_data3.txt']
text_data = load_text_from_files(path_list)
```
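
The helper `load_text_from_files` is defined earlier in this chapter and is not shown in this excerpt. Below is a minimal sketch, assuming it simply reads each text file and returns all lines as a list; refer to the full code for the actual implementation:

```python
def load_text_from_files(path_list):
    """Illustrative sketch: read each text file and collect all lines into a list."""
    text_data = []
    for file_path in path_list:
        with open(file_path, "r", encoding="utf-8") as f:
            text_data.extend(f.readlines())
    return text_data
```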

### Step 3: Create the Configuration Files

Before training the BPE tokenizer, we create a complete set of `Tokenizer` configuration files, including `tokenizer_config.json` and `special_tokens_map.json`. These files define the tokenizer's parameters and special tokens and are used for training and loading the `Tokenizer`. The `chat_template` here is kept consistent with the `Qwen2.5` models.

```python
def create_tokenizer_config(save_dir: str) -> None:
    """Create the full set of tokenizer configuration files."""
    config = {
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": True,
        "bos_token": "<|im_start|>",
        "eos_token": "<|im_end|>",
        "pad_token": "<|im_end|>",
        "unk_token": "<unk>",
        "model_max_length": 1000000000000000019884624838656,
        "clean_up_tokenization_spaces": False,
        "tokenizer_class": "PreTrainedTokenizerFast",
        "chat_template": (
            "{% for message in messages %}"
            "{% if message['role'] == 'system' %}"
            "<|im_start|>system\n{{ message['content'] }}<|im_end|>\n"
            "{% elif message['role'] == 'user' %}"
            "<|im_start|>user\n{{ message['content'] }}<|im_end|>\n"
            "{% elif message['role'] == 'assistant' %}"
            "<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n"
            "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
    }

    # Save the main configuration file
    with open(os.path.join(save_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # Create special_tokens_map.json
    special_tokens_map = {
        "bos_token": "<|im_start|>",
        "eos_token": "<|im_end|>",
        "unk_token": "<unk>",
        "pad_token": "<|im_end|>",
        "additional_special_tokens": ["<s>", "</s>"]
    }
    with open(os.path.join(save_dir, "special_tokens_map.json"), "w", encoding="utf-8") as f:
        json.dump(special_tokens_map, f, ensure_ascii=False, indent=4)
```

### Step 4: Train the BPE Tokenizer

Next, we define a training function that trains the tokenizer and saves the resulting files to a target directory. We use the `Tokenizer` class from the `tokenizers` library to train the BPE tokenizer.

Note that training configures several special tokens: `<unk>`, `<s>`, `</s>`, `<|im_start|>`, and `<|im_end|>`. They mark unknown words, the start and end of a sentence, and the start and end of a dialogue turn, respectively. These special tokens help the model understand the structure of the text and improve its generalization.

During training we also need to specify the BPE tokenizer's parameters, such as the vocabulary size, the minimum merge frequency, and the special tokens. These parameters can be tuned for the specific task and dataset to obtain better segmentation.
```python
def train_tokenizer(data_path: str, save_dir: str, vocab_size: int = 8192) -> None:
    """Train and save a custom tokenizer."""
    os.makedirs(save_dir, exist_ok=True)

    # Initialize the tokenizer
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.normalizer = NFKC()  # add text normalization
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    # Configure the special tokens
    special_tokens = [
        "<unk>",
        "<s>",
        "</s>",
        "<|im_start|>",
        "<|im_end|>"
    ]

    # Configure the trainer
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,  # make sure the special tokens are added to the vocabulary
        min_frequency=2,                # filter out low-frequency merges
        show_progress=True,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
    )

    # Train the tokenizer
    print(f"Training tokenizer with data from {data_path}")
    texts = read_texts_from_jsonl(data_path)
    tokenizer.train_from_iterator(texts, trainer=trainer, length=os.path.getsize(data_path))

    # Verify the special token mapping
    try:
        assert tokenizer.token_to_id("<unk>") == 0
        assert tokenizer.token_to_id("<s>") == 1
        assert tokenizer.token_to_id("</s>") == 2
        assert tokenizer.token_to_id("<|im_start|>") == 3
        assert tokenizer.token_to_id("<|im_end|>") == 4
    except AssertionError as e:
        print("Special tokens mapping error:", e)
        raise

    # Save the tokenizer file
    tokenizer.save(os.path.join(save_dir, "tokenizer.json"))

    # Create the configuration files
    create_tokenizer_config(save_dir)
    print(f"Tokenizer saved to {save_dir}")
```
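
The helper `read_texts_from_jsonl` used by the training function is defined in the full code and not shown here. Below is a minimal sketch, assuming the training data is a JSONL file with one JSON object per line containing a `text` field; the data path and output directory in the call are placeholder values:

```python
def read_texts_from_jsonl(data_path: str) -> Generator[str, None, None]:
    """Illustrative sketch: read a JSONL file line by line and yield the "text" field."""
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            yield data["text"]


# Example call: the paths are placeholders; vocab_size matches the vocabulary size (6144) shown in the output below
train_tokenizer(data_path="./dataset/pretrain_data.jsonl", save_dir="./output", vocab_size=6144)
```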

### Step 5: Use the Trained Tokenizer

Once training is done, we can use the trained tokenizer to process text: encoding, decoding, applying the chat template, and so on. Below we define an evaluation function, `eval_tokenizer`, that shows how to load and test the trained tokenizer.
```python
def eval_tokenizer(tokenizer_path: str) -> None:
    """Evaluate the tokenizer's functionality."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        return

    # Check basic properties
    print("\n=== Tokenizer基本信息 ===")
    print(f"Vocab size: {len(tokenizer)}")
    print(f"Special tokens: {tokenizer.all_special_tokens}")
    print(f"Special token IDs: {tokenizer.all_special_ids}")

    # Test the chat template
    messages = [
        {"role": "system", "content": "你是一个AI助手。"},
        {"role": "user", "content": "How are you?"},
        {"role": "assistant", "content": "I'm fine, thank you. and you?"},
        {"role": "user", "content": "I'm good too."},
        {"role": "assistant", "content": "That's great to hear!"},
    ]
    print("\n=== 聊天模板测试 ===")
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        # add_generation_prompt=True
    )
    print("Generated prompt:\n", prompt, sep="")

    # Test encoding and decoding
    print("\n=== 编码解码测试 ===")
    encoded = tokenizer(prompt, truncation=True, max_length=256)
    decoded = tokenizer.decode(encoded["input_ids"], skip_special_tokens=False)
    print("Decoded text matches original:", decoded == prompt)

    # Test special token handling
    print("\n=== 特殊token处理 ===")
    test_text = "<|im_start|>user\nHello<|im_end|>"
    encoded = tokenizer(test_text).input_ids
    decoded = tokenizer.decode(encoded)
    print(f"Original: {test_text}")
    print(f"Decoded: {decoded}")
    print("Special tokens preserved:", decoded == test_text)
```

With the evaluation function defined, call it with the directory where the trained tokenizer was saved:

```python
eval_tokenizer('your tokenizer path')
```

OUT:
```
=== Tokenizer基本信息 ===
Vocab size: 6144
Special tokens: ['<|im_start|>', '<|im_end|>', '<unk>', '<s>', '</s>']
Special token IDs: [3, 4, 0, 1, 2]

=== 聊天模板测试 ===
Generated prompt:
<|im_start|>system
你是一个AI助手。<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
I'm fine, thank you. and you?<|im_end|>
<|im_start|>user
I'm good too.<|im_end|>
<|im_start|>assistant
That's great to hear!<|im_end|>

=== 编码解码测试 ===
Decoded text matches original: False

=== 特殊token处理 ===
Original: <|im_start|>user
Hello<|im_end|>
Decoded: <|im_start|> user
Hello<|im_end|>
Special tokens preserved: False
```

Note that in both checks the decoded text does not exactly match the original: an extra space appears after the special token (`<|im_start|> user`), which is something to keep in mind in practice.

Besides loading the tokenizer through `AutoTokenizer` as in `eval_tokenizer`, there are a few other ways to load and use the trained tokenizer.

(1) Load the trained tokenizer with the tokenizers library

After training, we can load the saved `tokenizer.json` directly with `Tokenizer.from_file` and tokenize input text. Each resulting token has a corresponding id that can be used for subsequent model training and inference.
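
The snippet below loads the tokenizer directly, assuming it was saved to `./output/tokenizer.json` as above. The tokens and ids in the comments come from the original example run; the actual results depend on the vocabulary you trained:

```python
# Load and test the tokenizer with the tokenizers library
tokenizer = Tokenizer.from_file("./output/tokenizer.json")

encoding = tokenizer.encode("how old are you?heiheihei")
print(encoding.tokens)
print(encoding.ids)
# ['how', 'old', 'are', 'you', '?', 'hei', 'hei', 'hei']
# [2680, 1575, 1354, 2458, 34, 25088, 25088, 25088]
```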

(2) Load the tokenizer with the transformers library

We can also use `PreTrainedTokenizerFast` from the transformers library to load the trained tokenizer.

```python
# Load the tokenizer with the transformers library
from transformers import PreTrainedTokenizerFast

# tokenizer_file is the path to the trained tokenizer file
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer_test/llama-bpe-tokenizer.json",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
)

fast_tokenizer.encode('how old are you?'), fast_tokenizer.decode(fast_tokenizer.encode('how old are you?'))
# ([2680, 1575, 1354, 2458, 34], 'how old are you?')
```

In this example, we load the trained tokenizer with the `PreTrainedTokenizerFast` class from the transformers library and use its `encode()` and `decode()` methods to tokenize and decode text.

Finally, we can save it in a format that `AutoTokenizer` can load directly.

```python
fast_tokenizer.save_pretrained("tokenizer_test/llama-bpe-tokenizer")
```

(3) Load the tokenizer with transformers.AutoTokenizer

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tokenizer_test/llama-bpe-tokenizer")

text = "I am 18 years old!"
tokenizer.encode(text), tokenizer.decode(tokenizer.encode(text))
# ([44, 1286, 1481, 1749, 1575, 4], 'I am 18 years old!')
```

That completes the full workflow of training and using a BPE tokenizer. By training our own tokenizer, we can handle the text data better and improve the model's generalization and performance.