update ch05
@@ -8,39 +8,8 @@ from torch.utils.data import Dataset, DataLoader
 import torch
-from sklearn.model_selection import train_test_split
-import os
-
-
-class PretrainDataset(Dataset):
-    def __init__(self, df, tokenizer, max_length=512):
-        super().__init__()
-        self.df = df
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-        self.padding = 0
-
-    def __len__(self):
-        return self.df.shape[0]
-
-    def __getitem__(self, index: int):
-        # build one training sample from the dataframe row
-        sample = self.df.iloc[index]
-        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"
-        input_id = self.tokenizer(text).data['input_ids'][:self.max_length]
-        text_len = len(input_id)
-        # remaining length that has not reached max_length and still needs padding
-        padding_len = self.max_length - text_len
-        input_id = input_id + [self.padding] * padding_len
-        # 0 marks positions that are excluded from the loss
-        loss_mask = [1] * text_len + [0] * padding_len
-
-        input_id = np.array(input_id)
-        X = np.array(input_id[:-1]).astype(np.int64)
-        Y = np.array(input_id[1:]).astype(np.int64)
-        loss_mask = np.array(loss_mask[1:]).astype(np.int64)
-        return torch.from_numpy(X), torch.from_numpy(Y), torch.from_numpy(loss_mask)
-
-
-class SkyWorkPretrainDataset(Dataset):
+class PretrainDataset(Dataset):
     def __init__(self, data_path, tokenizer, max_length=512):
         super().__init__()
         self.data_path = data_path

@@ -13,7 +13,7 @@ from contextlib import nullcontext
 from transformers import AutoTokenizer

 from k_model import ModelConfig, Transformer
-from dataset import PretrainDataset, SkyWorkPretrainDataset
+from dataset import PretrainDataset

 import swanlab

@@ -131,7 +131,7 @@ if __name__ == "__main__":
     parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
     parser.add_argument("--use_swanlab", type=bool, default=True, help="Use Weights & Biases")
     parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="/home/user/szx/dataset/seq-monkey/seq_monkey_datawhale.jsonl", help="Path to training data")
+    parser.add_argument("--data_path", type=str, default="", help="Path to training data")
     parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
     parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
@@ -152,7 +152,7 @@ if __name__ == "__main__":
         args.device = "cpu"

     if args.use_swanlab:
-        swanlab.login(api_key='BIYVGq2rfWmD9sFMCehUG')
+        swanlab.login(api_key='your key')
         run = swanlab.init(
             project="Tiny-LLM",
             experiment_name="Pretrain-215M",
@@ -174,7 +174,7 @@ if __name__ == "__main__":

     model, tokenizer = init_model()

-    train_ds = SkyWorkPretrainDataset(args.data_path, tokenizer, max_length=max_seq_len)
+    train_ds = PretrainDataset(args.data_path, tokenizer, max_length=max_seq_len)
     train_loader = DataLoader(
         train_ds,
         batch_size=args.batch_size,

@@ -139,7 +139,7 @@ if __name__ == "__main__":
     parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
     parser.add_argument("--use_swanlab", type=bool, default=True, help="Use Weights & Biases")
     parser.add_argument("--num_workers", type=int, default=4, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="/home/user/szx/dataset/BelleGroup/sft.jsonl", help="Path to training data")
+    parser.add_argument("--data_path", type=str, default="", help="Path to training data")
     parser.add_argument("--accumulation_steps", type=int, default=4, help="Gradient accumulation steps")
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
     parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
@@ -160,7 +160,7 @@ if __name__ == "__main__":
         args.device = "cpu"

     if args.use_swanlab:
-        swanlab.login(api_key='BIYVGq2rfWmD9sFMCehUG')
+        swanlab.login(api_key='your key')
         run = swanlab.init(
             project="Tiny-LLM",
             experiment_name="BelleGropu-sft-215M",

@@ -417,7 +417,7 @@ class Transformer(PreTrainedModel):
         return idx[:, index:]  # return only the newly generated tokens

 if __name__ == '__main__':
-    tokenizer = AutoTokenizer.from_pretrained("/home/user/szx/code/k-llm/tokenizer_k")
+    tokenizer = AutoTokenizer.from_pretrained("tokenizer_k")
     args = ModelConfig(
         dim=1024,
         n_layers=18,

@@ -8,7 +8,7 @@ import argparse

 class TextGenerator:
     def __init__(self,
-                 checkpoint=None,  # path to the model checkpoint
+                 checkpoint='out/SkyWork_pretrain_768_12_6144.pth',  # path to the model checkpoint
                  tokenizer_model_path='./tokenizer_k/',  # path to the tokenizer model
                  seed=42,  # random seed for reproducibility
                  device=None,  # device; prefer CUDA and fall back to CPU when CUDA is unavailable
@@ -33,8 +33,15 @@ class TextGenerator:
         # choose the appropriate automatic mixed-precision context based on dtype
         ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[self.dtype]
         self.ctx = nullcontext() if self.device_type == 'cpu' else torch.amp.autocast(device_type=self.device_type, dtype=ptdtype)

-        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint, trust_remote_code=True)
+        # load the model checkpoint file
+        checkpoint_dict = torch.load(self.checkpoint, map_location=self.device)  # load the saved model parameters
+        self.model = Transformer(ModelConfig(dim=1024, n_layers=18))  # instantiate the Transformer model
+        sunwanted_prefix = '_orig_mod.'
+        for k, v in list(checkpoint_dict.items()):
+            if k.startswith(sunwanted_prefix):
+                checkpoint_dict[k[len(sunwanted_prefix):]] = checkpoint_dict.pop(k)
+        self.model.load_state_dict(checkpoint_dict, strict=False)

         # count the number of trainable model parameters
         num_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
@@ -72,8 +79,8 @@ class TextGenerator:
         start = self.chat_template(start)
         # encode the starting text into a sequence of token ids
         start_ids = self.tokenizer(start).data['input_ids']
         # print('start_ids:', start_ids)
         x = (torch.tensor(start_ids, dtype=torch.long, device=self.device)[None, ...])  # convert the encoded token ids into a PyTorch tensor
         # print(self.tokenizer.eos_token_id)
         generated_texts = []  # holds the generated text samples
         with torch.no_grad():  # disable gradient computation for efficiency
             with self.ctx:  # enter the automatic mixed-precision context (when on GPU with float16)
@@ -81,34 +88,64 @@ class TextGenerator:
                     y = self.model.generate(x, self.tokenizer.eos_token_id, max_new_tokens, temperature=temperature, top_k=top_k)  # generate text
                     generated_texts.append(self.tokenizer.decode(y[0].tolist()))  # decode the generated token sequence into readable text
         return generated_texts  # return the generated text samples


     def pretrain_sample(self,
                         start="Hello!",  # starting prompt for generation; can be any string
                         num_samples=3,  # number of samples to generate (default 3)
                         max_new_tokens=256,  # maximum number of tokens generated per sample (default 256)
                         temperature=0.7,  # controls randomness; 1.0 is standard, higher values are more random
                         top_k=300):  # keep only the top_k most probable tokens, limiting the sampling pool
         """
         Generate samples from the given starting text.

         :param start: starting prompt for generation
         :param num_samples: number of text samples to generate
         :param max_new_tokens: maximum number of tokens generated per sample
         :param temperature: controls randomness; lower is more deterministic, higher is more random
         :param top_k: limits the range of tokens considered during generation
         :return: list of generated text samples
         """
         # if start begins with 'FILE:', read the starting text from that file
         if start.startswith('FILE:'):
             with open(start[5:], 'r', encoding='utf-8') as f:
                 start = f.read()  # use the file contents as the starting text

         # encode the starting text into a sequence of token ids
         start_ids = self.tokenizer(start).data['input_ids']
         # print('start_ids:', start_ids)
         x = (torch.tensor(start_ids, dtype=torch.long, device=self.device)[None, ...])  # convert the encoded token ids into a PyTorch tensor
         # print(x.shape)
         generated_texts = []  # holds the generated text samples
         with torch.no_grad():  # disable gradient computation for efficiency
             with self.ctx:  # enter the automatic mixed-precision context (when on GPU with float16)
                 for k in range(num_samples):  # loop to generate the requested number of samples
                     y = self.model.generate(x, max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k)  # generate text
                     generated_texts.append(self.tokenizer.decode(y[0].tolist()))  # decode the generated token sequence into readable text

         return generated_texts  # return the generated text samples

 # example usage
 if __name__ == "__main__":
     print("\n ------------------- SFT Sample ------------------- \n")
     sft_prompt_datas = [
         '你好呀',
         "中国的首都是哪里?",
         "1+9等于几",
         "1+3等于几",
         "单片机是什么?",
         "你是谁?",
         "谁创造了你?",
         "1+1等于多少?",
     ]
-    generator = TextGenerator(checkpoint='./k-model-82M/')  # initialize the generator
+    generator = TextGenerator(checkpoint='./BeelGroup_sft_model_215M/sft_dim1024_layers18_vocab_size6144.pth')  # initialize the generator
     for i in range(len(sft_prompt_datas)):
         samples = generator.sft_sample(start=sft_prompt_datas[i], num_samples=1, max_new_tokens=512, temperature=0.75)
         print(f"\nSample {i+1}:\nQuestion: {sft_prompt_datas[i]} \nAI answer: {samples[0]}\n{'-'*20}")  # print the generated sample followed by a separator line


-    # print("\n ------------------- Pretrain Sample ------------------- \n")
+    print("------------------- Pretrain Sample ------------------- \n")

-    # pretrain_prompt_datas = [
-    #     '<|im_start|>近年来,单片机以其体积小、价格廉、面向控制等独特优点',
-    #     '<|im_start|>明正德年间,迟姓由云南迁来居住,因靠磨山',
-    #     '<|im_start|>中国矿业大学-北京(CUMTB)是一所以矿业为特色,工',
-    # ]
+    pretrain_prompt_datas = [
+        '<|im_start|>北京大学是',
+        '<|im_start|>中国矿业大学(北京)地球科学与测绘工程学院',
+    ]

-    # generator = TextGenerator(checkpoint='base_model/SkyWork_pretrain_768_12_6144.pth')  # initialize the generator
-    # for i in range(len(pretrain_prompt_datas)):
-    #     samples = generator.pretrain_sample(start=pretrain_prompt_datas[i], num_samples=1, max_new_tokens=50, temperature=0.75)
-    #     print(f"\nSample {i+1}:\nQuestion: {pretrain_prompt_datas[i]} \nAI answer: {samples[0]}\n{'-'*20}")  # print the generated sample followed by a separator line
+    generator = TextGenerator(checkpoint='./base_monkey_215M/pretrain_1024_18_6144.pth')  # initialize the generator
+    for i in range(len(pretrain_prompt_datas)):
+        samples = generator.pretrain_sample(start=pretrain_prompt_datas[i], num_samples=1, max_new_tokens=120, temperature=1.0)
+        print(f"\nSample {i+1}:\n{pretrain_prompt_datas[i]}{samples[0]}\n{'-'*20}")  # print the generated sample followed by a separator line
@@ -1,66 +0,0 @@
-import json
-import random
-import numpy as np
-import streamlit as st
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-# from transformers.generation.utils import GenerationConfig
-
-st.set_page_config(page_title="K-Model-215M LLM")
-st.title("K-Model-215M LLM")
-st.caption("🚀 A streamlit chatbot powered by Self-LLM")
-
-
-with st.sidebar:
-    st.markdown("## K-Model-215M LLM")
-    "[开源大模型食用指南 self-llm](https://github.com/datawhalechina/self-llm.git)"
-    # sliders for the maximum input/generation length and the sampling temperature
-    st.sidebar.title("设定调整")
-    st.session_state.max_new_tokens = st.sidebar.slider("最大输入/生成长度", 128, 512, 512, step=1)
-    st.session_state.temperature = st.sidebar.slider("temperature", 0.1, 1.2, 0.75, step=0.01)
-
-
-model_id = "./k-model-215M/"
-
-# helper function that loads the model and tokenizer
-@st.cache_resource
-def get_model():
-    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto").eval()
-    return tokenizer, model
-
-
-tokenizer, model = get_model()
-
-# if session_state has no "messages" yet, create a list with a default greeting
-if "messages" not in st.session_state:
-    st.session_state["messages"] = [{"role": "assistant", "content": "有什么可以帮您的?"}]
-
-# iterate over all messages in session_state and display them in the chat UI
-for msg in st.session_state.messages:
-    st.chat_message(msg["role"]).write(msg["content"])
-
-# if the user typed something into the chat input box, handle it
-if prompt := st.chat_input():
-
-    # show the user's input in the chat UI
-    st.chat_message("user").write(prompt)
-
-    # append the user's input to the messages list in session_state
-    st.session_state.messages.append({"role": "user", "content": prompt})
-
-    # feed the conversation to the model and get its reply
-    input_ids = tokenizer.apply_chat_template(st.session_state.messages, tokenize=False, add_generation_prompt=True)
-    input_ids = tokenizer(input_ids).data['input_ids']
-    x = (torch.tensor(input_ids, dtype=torch.long)[None, ...])
-
-    with torch.no_grad():
-        y = model.generate(x, tokenizer.eos_token_id, st.max_new_tokens, temperature=st.temperature)
-    response = tokenizer.decode(y[0].tolist())
-
-    # append the model's output to the messages list in session_state
-    st.session_state.messages.append({"role": "assistant", "content": response})
-    # show the model's output in the chat UI
-    st.chat_message("assistant").write(response)
-    # print(st.session_state)  # debug: print session_state
BIN  docs/chapter5/images/pretrain_dataset.png  (new file, binary not shown; size: 23 KiB)
BIN  docs/chapter5/images/sftdataset.png  (new file, binary not shown; size: 25 KiB)