diff --git a/docs/chapter5/code/download.py b/docs/chapter5/code/deal_dataset.py
similarity index 54%
rename from docs/chapter5/code/download.py
rename to docs/chapter5/code/deal_dataset.py
index 50c2aaf..8d5c049 100644
--- a/docs/chapter5/code/download.py
+++ b/docs/chapter5/code/deal_dataset.py
@@ -1,32 +1,24 @@
-import os
-from tqdm import tqdm
+import os
 import json
+from tqdm import tqdm
 
-# Set environment variables
-os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
-
-
-# Download the pretraining dataset
-os.system("modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir your_local_dir")
-# Extract the pretraining dataset
-os.system("tar -xvf your_local_dir/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 -C your_local_dir")
-
-# Download the SFT dataset
-os.system(f'huggingface-cli download --repo-type dataset --resume-download BelleGroup/train_3.5M_CN --local-dir BelleGroup')
-
+# pretrain_data is the local path of the pretraining data downloaded by download_dataset.sh
+pretrain_data = 'your local pretrain_data'
+output_pretrain_data = 'seq_monkey_datawhale.jsonl'
+# sft_data is the local path of the SFT data downloaded by download_dataset.sh
+sft_data = 'your local sft_data'
+output_sft_data = 'BelleGroup_sft.jsonl'
 
 # 1. Process the pretraining data
 def split_text(text, chunk_size=512):
     """Split text into chunks of the given length"""
     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
-input_file = 'mobvoi_seq_monkey_general_open_corpus.jsonl'
-
-with open('seq_monkey_datawhale.jsonl', 'a', encoding='utf-8') as pretrain:
-    with open(input_file, 'r', encoding='utf-8') as f:
+with open(output_pretrain_data, 'a', encoding='utf-8') as pretrain:
+    with open(pretrain_data, 'r', encoding='utf-8') as f:
         data = f.readlines()
-        for line in tqdm(data, desc=f"Processing lines in {input_file}", leave=False):  # line-level progress bar
+        for line in tqdm(data, desc=f"Processing lines in {pretrain_data}", leave=False):  # line-level progress bar
             line = json.loads(line)
             text = line['text']
             chunks = split_text(text)
@@ -34,7 +26,6 @@ with open('seq_monkey_datawhale.jsonl', 'a', encoding='utf-8') as pretrain:
             pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')
 
 # 2. Process the SFT data
-
 def convert_message(data):
     """
     Convert raw data into the standard message format
@@ -49,10 +40,10 @@ def convert_message(data):
             message.append({'role': 'assistant', 'content': item['value']})
     return message
 
-with open('BelleGroup_sft.jsonl', 'a', encoding='utf-8') as sft:
-    with open('BelleGroup/train_3.5M_CN.json', 'r') as f:
+with open(output_sft_data, 'a', encoding='utf-8') as sft:
+    with open(sft_data, 'r') as f:
         data = f.readlines()
         for item in tqdm(data, desc="Processing", unit="lines"):
             item = json.loads(item)
             message = convert_message(item['conversations'])
-            sft.write(json.dumps(message, ensure_ascii=False) + '\n')
+            sft.write(json.dumps(message, ensure_ascii=False) + '\n')
\ No newline at end of file
diff --git a/docs/chapter5/code/download_dataset.sh b/docs/chapter5/code/download_dataset.sh
new file mode 100644
index 0000000..9a2892c
--- /dev/null
+++ b/docs/chapter5/code/download_dataset.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Set environment variables
+export HF_ENDPOINT=https://hf-mirror.com
+
+# Local directory to download the datasets into
+dataset_dir="your local dataset dir"
+
+# Download the pretraining dataset
+modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir ${dataset_dir}
+
+# Extract the pretraining dataset
+tar -xvf "${dataset_dir}/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "${dataset_dir}"
+
+# Download the SFT dataset
+huggingface-cli download \
+    --repo-type dataset \
+    --resume-download \
+    BelleGroup/train_3.5M_CN \
+    --local-dir "${dataset_dir}/BelleGroup"
\ No newline at end of file
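For quick sanity-checking, here is a minimal sketch that exercises the two helpers from `deal_dataset.py` on toy inputs. `split_text` is copied verbatim from the diff above; the empty-list initialization and the `human` branch of `convert_message` are not visible in the hunks and are inferred, and the sample record is made up:

```python
import json

def split_text(text, chunk_size=512):
    """Split text into fixed-size chunks (verbatim from deal_dataset.py)."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def convert_message(data):
    """Convert one BelleGroup 'conversations' list into role/content messages.

    Only the assistant branch appears in the diff; the empty-list
    initialization and the 'human' branch are assumptions.
    """
    message = []
    for item in data:
        if item['from'] == 'human':
            message.append({'role': 'user', 'content': item['value']})
        else:
            message.append({'role': 'assistant', 'content': item['value']})
    return message

# Made-up record in the train_3.5M_CN layout: {"conversations": [{"from": ..., "value": ...}]}
record = {
    "conversations": [
        {"from": "human", "value": "What is LLaMA2?"},
        {"from": "assistant", "value": "A family of large language models released by Meta in 2023."},
    ]
}

chunks = split_text("a" * 1030)
print([len(c) for c in chunks])  # -> [512, 512, 6]
print(json.dumps(convert_message(record["conversations"]), ensure_ascii=False))
```

Running it prints the chunk lengths produced by the 512-character splitter and a two-turn user/assistant message list, matching the JSONL layouts the script writes out.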
diff --git a/docs/chapter5/第五章 动手搭建大模型.md b/docs/chapter5/第五章 动手搭建大模型.md
index 9f391ee..c42a156 100644
--- a/docs/chapter5/第五章 动手搭建大模型.md
+++ b/docs/chapter5/第五章 动手搭建大模型.md
@@ -4,11 +4,11 @@
 Meta (formerly Facebook) released LLaMA, its first Transformer-based large language model, in February 2023, and followed it with LLaMA2 in July of the same year. In Chapter 4 we already learned what an LLM is and how one is trained. In this section, we will learn how to build a LLaMA2 model by hand.
 
-The LLaMA2 model structure is shown in Figure 5.0 below:
+The LLaMA2 model structure is shown in Figure 5.1 below:
- Figure 5.0 LLaMA2 structure
+Figure 5.1 LLaMA2 structure
+ Figure 5.2 LLaMA2 Attention structure
+
- Figure 5.1 Pretraining loss computation
+Figure 5.3 Pretraining loss computation
- Figure 5.2 SFT loss computation
+Figure 5.4 SFT loss computation