update ch05

2025-02-26 11:24:19 +08:00
parent 2edfb76f7a
commit ca3e727e1c
21 changed files with 13737 additions and 1044 deletions
--- a/模型结构-LLaMA.md
+++ b/模型结构-LLaMA.md
@@ -1,4 +1,4 @@
-# 5.1 动手写一个 LLaMA2 模型
+# 5.1 动手实现一个 LLaMA2 大模型

 Meta（原Facebook）于2023年2月发布第一款基于Transformer结构的大型语言模型-LLaMA，并于同年7月发布同系列模型-LLaMA2。我们在第四章已经了解了LLM，以及如何训练LLM等等。那么本小节我们就来学习，如何动手写一个LLaMA2模型。

@@ -7,28 +7,47 @@ Meta（原Facebook）于2023年2月发布第一款基于Transformer结构的大

 首先我们需要定义一些超参数，这些超参数包括模型的大小、层数、头数、词嵌入维度、隐藏层维度等等。这些超参数可以根据实际情况进行调整。

-这里我们自定义一个`ModelArgs`类，来存储和记录我们的超参数，方便后续修改和直接倒入。
+这里我们自定义一个`ModelConfig`类，来存储和记录我们的超参数。它继承了`transformers`库中的参数类`PretrainedConfig`，这样我们既可以方便地使用`transformers`库中的一些功能，也方便在后续导出Hugging Face模型。

 ```python
-class ModelArgs:
-    # 自定义超参数
-    dim: int = 288  # 模型维度
-    n_layers: int = 6  # Transformer层数
-    n_heads: int = 6  # 注意力机制的头数
-    n_kv_heads: Optional[int] = 6  # 键/值头数，如果未指定，则默认为n_heads
-    vocab_size: int = 32000  # 词汇表大小
-    hidden_dim: Optional[int] = None  # 隐藏层维度，如果未指定，则使用其他规则确定
-    multiple_of: int = 32  # MLP隐藏层大小是这个数的倍数
-    norm_eps: float = 1e-5  # 归一化层的epsilon值
-    max_seq_len: int = 256  # 最大序列长度
-    dropout: float = 0.0  # 丢弃率
+from transformers import PretrainedConfig
+
+class ModelConfig(PretrainedConfig): # hyperparameter container for the model; extra kwargs go to PretrainedConfig
+    model_type = "Tiny-K"
+    def __init__(
+            self,
+            dim: int = 768, # model dimension
+            n_layers: int = 12, # number of Transformer layers
+            n_heads: int = 16, # number of attention heads
+            n_kv_heads: int = 8, # number of key/value heads
+            vocab_size: int = 6144, # vocabulary size
+            hidden_dim: int = None, # hidden-layer dimension (None = not specified)
+            multiple_of: int = 64, # presumably the MLP hidden size is kept a multiple of this - confirm against MLP
+            norm_eps: float = 1e-5, # eps of the normalization layers
+            max_seq_len: int = 512, # maximum sequence length
+            dropout: float = 0.0, # dropout probability
+            flash_attn: bool = True, # whether to use Flash Attention
+            **kwargs,
+    ):
+        self.dim = dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.n_kv_heads = n_kv_heads
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.multiple_of = multiple_of
+        self.norm_eps = norm_eps
+        self.max_seq_len = max_seq_len
+        self.dropout = dropout
+        self.flash_attn = flash_attn
+        super().__init__(**kwargs) # forward any remaining options to PretrainedConfig
 ```

 我们来看一下其中的一些超参数的含义，比如`dim`是模型维度，`n_layers`是Transformer的层数，`n_heads`是注意力机制的头数，`vocab_size`是词汇表大小，`max_seq_len`是输入的最大序列长度等等。上面的代码中也对每一个参数做了详细的注释，在后面的代码中我们会根据这些超参数来构建我们的模型。

-## 5.1.2 构建LLaMA2RMSNorm
+## 5.1.2 构建 RMSNorm

-`LLaMA2RMSNorm`可以用如下的数学公式表示：
+`RMSNorm`可以用如下的数学公式表示：

 $$
 \text{RMSNorm}(x) = \frac{x}{\sqrt{\frac{1}{n}\sum_{i=1}^{n}w_i^2 + \epsilon}}
@@ -42,10 +61,10 @@ $$

 这种归一化有助于通过确保权重的规模不会变得过大或过小来稳定学习过程，这在具有许多层的深度学习模型中特别有用。

-我们可以通过如下代码实现`LLaMA2RMSNorm`：
+我们可以通过如下代码实现`RMSNorm`：

 ```python
-class LLaMA2RMSNorm(nn.Module):
+class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float):
        super().__init__()
        # eps是为了防止除以0的情况
@@ -68,10 +87,10 @@ class LLaMA2RMSNorm(nn.Module):
        return output * self.weight
 ```

-并且，我们可以用下面的代码来对`LLaMA2RMSNorm`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 288])`，与我们输入的形状一致，说明模块的实现是正确的，归一化并不会改变输入的形状。
+并且，我们可以用下面的代码来对`RMSNorm`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 288])`，与我们输入的形状一致，说明模块的实现是正确的，归一化并不会改变输入的形状。

 ```python
-norm = LLaMA2RMSNorm(args.dim, args.norm_eps)
+norm = RMSNorm(args.dim, args.norm_eps)
 x = torch.randn(1, 50, args.dim)
 output = norm(x)
 print(output.shape)
@@ -220,6 +239,8 @@ xq_out.shape, xk_out.shape
 OUT:
 ```
 torch.Size([50, 24]) torch.Size([50, 24])
+
+(torch.Size([1, 50, 6, 48]), torch.Size([1, 50, 6, 48]))
 ```

 ### 5.1.3.3 组装 LLaMA2 Attention
@@ -227,8 +248,8 @@ torch.Size([50, 24]) torch.Size([50, 24])
 在上面我们已经完成了旋转嵌入的实现，接下来我们就可以构建 LLaMA2 Attention 模块了。

 ```python
-class LLaMA2Attention(nn.Module):
-    def __init__(self, args: ModelArgs):
+class Attention(nn.Module):
+    def __init__(self, args: ModelConfig):
        super().__init__()
        # 根据是否指定n_kv_heads，确定用于键（key）和值（value）的头的数量。
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
@@ -315,11 +336,11 @@ class LLaMA2Attention(nn.Module):
        return output
 ```

-同样大家可以使用下面的代码来对`LLaMA2Attention`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 288])`，与我们输入的形状一致，说明模块的实现是正确的。
+同样大家可以使用下面的代码来对`Attention`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 768])`，与我们输入的形状一致，说明模块的实现是正确的。

 ```python
 # 创建Attention实例
-attention_model = LLaMA2Attention(args)
+attention_model = Attention(args)

 # 模拟输入数据
 batch_size = 1
@@ -340,15 +361,15 @@ print("Output shape:", output.shape)

 OUT:
 ```
-Output shape: torch.Size([1, 50, 288])
+Output shape: torch.Size([1, 50, 768])
 ```

 ## 5.1.4 构建 LLaMA2 MLP模块

-相对于前面我们实现的LLaMA2 Attention模块，LLaMA2 MLP模块的实现要简单一些。我们可以通过如下代码实现`LLaMA2MLP`：
+相对于前面我们实现的LLaMA2 Attention模块，LLaMA2 MLP模块的实现要简单一些。我们可以通过如下代码实现`MLP`：

 ```python
-class LLaMA2MLP(nn.Module):
+class MLP(nn.Module):
    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        # 如果没有指定隐藏层的维度，我们将其设置为输入维度的4倍
@@ -376,13 +397,13 @@ class LLaMA2MLP(nn.Module):

 我们着重观察一下`forward`函数的实现，首先，输入 `x` 通过第一层线性变换 `self.w1` 和 `SILU` 激活函数，然后，结果乘以输入 `x` 通过第三层线性变换 `self.w3` 的结果，最后，通过第二层线性变换 `self.w2` 和 `dropout` 层，得到最终输出。

-同样大家可以使用下面的代码来对`LLaMAMLP`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 288])`，与我们输入的形状一致，说明模块的实现是正确的。
+同样大家可以使用下面的代码来对`MLP`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 768])`，与我们输入的形状一致，说明模块的实现是正确的。

 ```python
 # 创建MLP实例
-mlp = LLaMA2MLP(args.dim, args.hidden_dim, args.multiple_of, args.dropout)
+mlp = MLP(args.dim, args.hidden_dim, args.multiple_of, args.dropout)
 # 随机生成数据
-x = torch.randn(1, 50, 288)
+x = torch.randn(1, 50, args.dim)
 # 运行MLP模型
 output = mlp(x)
 print(output.shape)
@@ -390,7 +411,7 @@ print(output.shape)

 OUT:
 ```
-torch.Size([1, 50, 288])
+torch.Size([1, 50, 768])
 ```

 ## 5.1.5 LLaMA2 Decoder Layer
@@ -398,8 +419,8 @@ torch.Size([1, 50, 288])
 到这里，我们已经实现了`LLaMA2`模型的`Attention`模块和`MLP`模块，接下来我们就可以构建`LLaMA2`的`Decoder Layer`了。

 ```python
-class LLaMA2DecoderLayer(nn.Module):
-    def __init__(self, layer_id: int, args: ModelArgs):
+class DecoderLayer(nn.Module):
+    def __init__(self, layer_id: int, args: ModelConfig):
        super().__init__()
        # 定义多头注意力的头数
        self.n_heads = args.n_heads
@@ -408,9 +429,9 @@ class LLaMA2DecoderLayer(nn.Module):
        # 定义每个头的维度，等于输入维度除以头数
        self.head_dim = args.dim // args.n_heads
        # 定义LLaMA2Attention对象，用于进行多头注意力计算
-        self.attention = LLaMA2Attention(args)
+        self.attention = Attention(args)
        # 定义LLaMAMLP对象，用于进行前馈神经网络计算
-        self.feed_forward = LLaMA2MLP(
+        self.feed_forward = MLP(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
@@ -419,9 +440,9 @@ class LLaMA2DecoderLayer(nn.Module):
        # 定义层的ID
        self.layer_id = layer_id
        # 定义注意力计算的归一化层
-        self.attention_norm = LLaMA2RMSNorm(args.dim, eps=args.norm_eps)
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        # 定义前馈神经网络计算的归一化层
-        self.ffn_norm = LLaMA2RMSNorm(args.dim, eps=args.norm_eps)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        # 前向传播函数
@@ -434,11 +455,11 @@ class LLaMA2DecoderLayer(nn.Module):

 `DecoderLayer`就是把我们上面完成的`Attention`模块和`MLP`模块组合在一起，实现了一个完整的`Transformer`模块。

-同样大家可以使用下面的代码来对`LLaMA2DecoderLayer`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 288])`，与我们输入的形状一致，说明模块的实现是正确的。
+同样大家可以使用下面的代码来对`DecoderLayer`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 50, 768])`，与我们输入的形状一致，说明模块的实现是正确的。

 ```python
 # 创建LLaMADecoderLayer实例
-decoderlayer = LLaMA2DecoderLayer(0, args)
+decoderlayer = DecoderLayer(0, args)

 # 模拟输入数据
 dim = args.dim
@@ -455,19 +476,20 @@ print(out.shape) # 形状和输入的x一样 [batch_size, seq_len, dim]

 OUT:
 ```
-torch.Size([1, 50, 288])
+torch.Size([1, 50, 768])
 ```

 ## 5.1.6 构建 LLaMA2 模型

-好了，我们已经完了上述所有的模块的实现，接下来就是激动人心的时刻，我们可以构建`LLaMA2`模型了。，`LLaMA2`模型就是将`LLaMA2DecoderLayer`模块堆叠起来，构成一个完整的`Transformer`模型。
+好了，我们已经完成了上述所有模块的实现，接下来就是激动人心的时刻，我们可以构建`LLaMA2`模型了。`LLaMA2`模型就是将`DecoderLayer`模块堆叠起来，构成一个完整的`Transformer`模型。

 ```python
-class LLaMA2Model(nn.Module):
-    last_loss: Optional[torch.Tensor]
+class Transformer(PreTrainedModel):
+    config_class = ModelConfig  # 配置类
+    last_loss: Optional[torch.Tensor] # 记录最后一次计算的损失

-    def __init__(self, args: ModelArgs):
-        super().__init__()
+    def __init__(self, args: ModelConfig = None):
+        super().__init__(args)
        # 初始化模型参数
        self.args = args
        # 词汇表大小
@@ -482,9 +504,9 @@ class LLaMA2Model(nn.Module):
        # Decoder层
        self.layers = torch.nn.ModuleList()
        for layer_id in range(args.n_layers):
-            self.layers.append(LLaMA2DecoderLayer(layer_id, args))
+            self.layers.append(DecoderLayer(layer_id, args))
        # 归一化层
-        self.norm = LLaMA2RMSNorm(args.dim, eps=args.norm_eps)
+        self.norm = RMSNorm(args.dim, eps=args.norm_eps)
        # 输出层
        self.output = nn.Linear(args.dim, args.vocab_size, bias=False)

@@ -505,6 +527,8 @@ class LLaMA2Model(nn.Module):

        # 初始化最后一次前向传播的损失属性
        self.last_loss = None
+        self.OUT = CausalLMOutputWithPast()  # 输出容器
+        self._no_split_modules = [name for name, _ in self.named_modules()]  # 不分割的模块列表

    def _init_weights(self, module):
        # 初始化权重的函数
@@ -515,7 +539,21 @@ class LLaMA2Model(nn.Module):
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
-    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None, **keyargs) -> torch.Tensor:
+        """
+        - tokens: Optional[torch.Tensor], 输入 token 张量。
+        - targets: Optional[torch.Tensor], 目标 token 张量。
+        - kv_cache: bool, 是否使用键值缓存。
+        - keyargs: 其他关键字参数。
+
+        - self.OUT: CausalLMOutputWithPast, 包含 logits 和损失。
+        """
+
+        if 'input_ids' in keyargs:
+            tokens = keyargs['input_ids']
+        if 'attention_mask' in keyargs:
+            targets = keyargs['attention_mask']
+
        # 前向传播函数
        _bsz, seqlen = tokens.shape
        # 通过词嵌入层和Dropout层
@@ -534,34 +572,74 @@ class LLaMA2Model(nn.Module):
        if targets is not None:
            # 如果给定了目标，计算损失
            logits = self.output(h)
-            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=0, reduction='none')
        else:
            # 推理时的小优化：只对最后一个位置的输出进行前向传播
            logits = self.output(h[:, [-1], :]) 
            self.last_loss = None

-        return logits
+        # 设置输出
+        self.OUT.__setitem__('logits', logits)
+        self.OUT.__setitem__('last_loss', self.last_loss)
+        return self.OUT
+
+    
+    @torch.inference_mode()
+    def generate(self, idx, stop_id=None, max_new_tokens=256, temperature=1.0, top_k=None):
+        """
+        Autoregressively extend `idx`, a (bsz, seq_len) long tensor, by up to `max_new_tokens` tokens (no KV cache; call in eval mode).
+        Returns only the new tokens; stops early once every row of the batch produces `stop_id` in the same step.
+        """
+        index = idx.shape[1]
+        for _ in range(max_new_tokens):
+            # Crop the context to the last max_seq_len tokens when it has grown too long.
+            idx_cond = idx if idx.size(1) <= self.args.max_seq_len else idx[:, -self.args.max_seq_len:]
+
+            # Forward pass, then keep only the logits of the final position.
+            logits = self(idx_cond).logits
+            logits = logits[:, -1, :]
+
+            if temperature == 0.0:
+                # Greedy decoding: take the single most likely token.
+                _, idx_next = torch.topk(logits, k=1, dim=-1)
+            else:
+                # Temperature-scale the logits, optionally mask everything below the top-k, then sample.
+                logits = logits / temperature
+                if top_k is not None:
+                    v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                    logits[logits < v[:, [-1]]] = -float('Inf')
+                probs = F.softmax(logits, dim=-1)
+                idx_next = torch.multinomial(probs, num_samples=1)
+
+            # Reduce with .all(): truth-testing the raw (bsz, 1) comparison raises for bsz > 1.
+            if stop_id is not None and (idx_next == stop_id).all():
+                break
+
+            # Append the new token to the running sequence and continue.
+            idx = torch.cat((idx, idx_next), dim=1)
+
+        return idx[:, index:] # only the newly generated tokens
 ```

-同样大家可以使用下面的代码来对`LLaMA2Model`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 1, 32000])`，与我们输入的形状一致，说明模块的实现是正确的。
+同样大家可以使用下面的代码来对`Transformer`模块进行测试，可以看到代码最终输出的形状为`torch.Size([1, 1, 6144])`，即`[batch_size, 1, vocab_size]`：未传入`targets`时模型只计算最后一个位置的logits，这与预期一致，说明模块的实现是正确的。

 ```python
 # Transformer.forward takes tokens and an optional targets; tokens is the input tensor and must be an integer type
-x = torch.randint(0, 32000, (1, 50)) # [bs, seq_len]
+x = torch.randint(0, 6144, (1, 50)) # [bs, seq_len]
 # Instantiate the Transformer model
-model = LLaMA2Model(args=args)
+model = Transformer(args=args)
 # Count all parameters of the model
 num_params = sum(p.numel() for p in model.parameters())
 print('Number of parameters:', num_params)

 out = model(x)
-print(out.shape) # [batch_size, 1, vocab_size]
+print(out.logits.shape) # [batch_size, 1, vocab_size]
 ```

 OUT:
 ```
-Number of parameters: 15191712
-torch.Size([1, 1, 32000])
+Number of parameters: 82594560
+torch.Size([1, 1, 6144])
 ```

 **参考文献**