Files
happy-llm/docs/chapter5/code/deal_dataset.py
KMnO4-zx ce535629ca docs(chapter5): 更新模型文档并添加数据处理脚本
- 更新LLaMA2模型文档,修正图片引用和编号
- 添加Attention结构示意图
- 新增数据处理脚本download_dataset.sh和deal_dataset.py
- 优化文档中的代码示例说明
2025-06-18 16:26:33 +08:00

49 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import json
from tqdm import tqdm
# pretrain_data 为运行download_dataset.sh时下载的pretrain_data本地路径
pretrain_data = 'your local pretrain_data'
output_pretrain_data = 'seq_monkey_datawhale.jsonl'
# sft_data 为运行download_dataset.sh时下载的sft_data本地路径
sft_data = 'your local sft_data'
output_sft_data = 'BelleGroup_sft.jsonl'
# 1 处理预训练数据
def split_text(text, chunk_size=512):
"""将文本按指定长度切分成块"""
return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
with open(output_pretrain_data, 'a', encoding='utf-8') as pretrain:
with open(pretrain_data, 'r', encoding='utf-8') as f:
data = f.readlines()
for line in tqdm(data, desc=f"Processing lines in {pretrain_data}", leave=False): # 添加行级别的进度条
line = json.loads(line)
text = line['text']
chunks = split_text(text)
for chunk in chunks:
pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')
# 2 处理SFT数据
def convert_message(data):
"""
将原始数据转换为标准格式
"""
message = [
{"role": "system", "content": "你是一个AI助手"},
]
for item in data:
if item['from'] == 'human':
message.append({'role': 'user', 'content': item['value']})
elif item['from'] == 'assistant':
message.append({'role': 'assistant', 'content': item['value']})
return message
with open(output_sft_data, 'a', encoding='utf-8') as sft:
with open(sft_data, 'r') as f:
data = f.readlines()
for item in tqdm(data, desc="Processing", unit="lines"):
item = json.loads(item)
message = convert_message(item['conversations'])
sft.write(json.dumps(message, ensure_ascii=False) + '\n')