{
"cells": [
{
"cell_type": "markdown",
"id": "bb9102c3-5b8d-4295-8f29-113b35ec5679",
"metadata": {},
"source": [
"# 一、LLM 预训练"
]
},
{
"cell_type": "markdown",
"id": "8557a6a6-294a-49c3-a8f6-e58bc3bf443d",
"metadata": {},
"source": [
"1.1 初始化 LLM"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "25f1fad8-772c-474e-a43e-77623106485d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2Config {\n",
" \"_name_or_path\": \"autodl-tmp/qwen-1.5b\",\n",
" \"architectures\": [\n",
" \"Qwen2ForCausalLM\"\n",
" ],\n",
" \"attention_dropout\": 0.0,\n",
" \"bos_token_id\": 151643,\n",
" \"eos_token_id\": 151643,\n",
" \"hidden_act\": \"silu\",\n",
" \"hidden_size\": 1536,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 8960,\n",
" \"max_position_embeddings\": 131072,\n",
" \"max_window_layers\": 28,\n",
" \"model_type\": \"qwen2\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 28,\n",
" \"num_key_value_heads\": 2,\n",
" \"rms_norm_eps\": 1e-06,\n",
" \"rope_theta\": 1000000.0,\n",
" \"sliding_window\": null,\n",
" \"tie_word_embeddings\": true,\n",
" \"torch_dtype\": \"bfloat16\",\n",
" \"transformers_version\": \"4.44.2\",\n",
" \"use_cache\": true,\n",
" \"use_mrope\": false,\n",
" \"use_sliding_window\": false,\n",
" \"vocab_size\": 151936\n",
"}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 加载定义好的模型参数-此处以 Qwen-2.5-1.5B 为例\n",
"# 使用 transforemrs 的 Config 类进行加载\n",
"from transformers import AutoConfig\n",
"\n",
"model_path = \"autodl-tmp/qwen-1.5b\"\n",
"config = AutoConfig.from_pretrained(model_path)\n",
"config"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "82b075a1-4fe9-4abb-b5b4-769d1c1a7156",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training new model from scratch - Total size=1472.20M params\n"
]
}
],
"source": [
"# 使用该配置生成一个定义好的模型\n",
"from transformers import AutoModelForCausalLM\n",
"\n",
"model = AutoModelForCausalLM.from_config(config,trust_remote_code=True)\n",
"model.to(\"cuda\")\n",
"n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())\n",
"print(f\"Training new model from scratch - Total size={n_params/2**20:.2f}M params\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e05ea707-23db-4e67-8b7d-e57d019887dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2ForCausalLM(\n",
" (model): Qwen2Model(\n",
" (embed_tokens): Embedding(151936, 1536)\n",
" (layers): ModuleList(\n",
" (0-27): 28 x Qwen2DecoderLayer(\n",
" (self_attn): Qwen2SdpaAttention(\n",
" (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n",
" (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
" (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n",
" (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n",
" (rotary_emb): Qwen2RotaryEmbedding()\n",
" )\n",
" (mlp): Qwen2MLP(\n",
" (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
" (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n",
" (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n",
" (act_fn): SiLU()\n",
" )\n",
" (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
" (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
" )\n",
" )\n",
" (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n",
" )\n",
" (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n",
")"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 看一下模型\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3408137b-eb50-4119-be1c-7a4ff951ab24",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Qwen2TokenizerFast(name_or_path='autodl-tmp/qwen-1.5b', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n",
"\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t151657: AddedToken(\"
| Step | \n", "Training Loss | \n", "
|---|---|
| 10 | \n", "10.987700 | \n", "
| 20 | \n", "9.160700 | \n", "
| 30 | \n", "8.352700 | \n", "
| 40 | \n", "8.159800 | \n", "
| 50 | \n", "8.042500 | \n", "
| 60 | \n", "8.014400 | \n", "
| 70 | \n", "7.986700 | \n", "
| 80 | \n", "7.951800 | \n", "
| 90 | \n", "7.875500 | \n", "
"
],
"text/plain": [
"