- Update the LLaMA2 model documentation, fixing image references and numbering
- Add an Attention structure diagram
- Add the data processing scripts download_dataset.sh and deal_dataset.py
- Improve the code example explanations in the documentation
import os
import json
from tqdm import tqdm

# pretrain_data is the local path of the pretrain_data downloaded by download_dataset.sh
pretrain_data = 'your local pretrain_data'
output_pretrain_data = 'seq_monkey_datawhale.jsonl'

# sft_data is the local path of the sft_data downloaded by download_dataset.sh
sft_data = 'your local sft_data'
output_sft_data = 'BelleGroup_sft.jsonl'

# 1. Process the pretraining data
def split_text(text, chunk_size=512):
    """Split text into chunks of the given length."""
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
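
# Quick sanity check (illustrative, not part of the original script): the last
# chunk is simply shorter than chunk_size; nothing is padded or dropped.
assert [len(c) for c in split_text('a' * 1030)] == [512, 512, 6]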

with open(output_pretrain_data, 'a', encoding='utf-8') as pretrain:
    with open(pretrain_data, 'r', encoding='utf-8') as f:
        data = f.readlines()
        for line in tqdm(data, desc=f"Processing lines in {pretrain_data}", leave=False):  # line-level progress bar
            line = json.loads(line)
            text = line['text']
            chunks = split_text(text)
            for chunk in chunks:
                pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')
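
# For reference, each line written above is a self-contained JSON object of the
# form {"text": "<one chunk of at most 512 characters>"} (illustrative shape;
# the actual chunk content comes from the downloaded pretrain_data).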

# 2. Process the SFT data
def convert_message(data):
    """
    Convert the raw conversation data into the standard message format.
    """
    message = [
        {"role": "system", "content": "你是一个AI助手"},
    ]
    for item in data:
        if item['from'] == 'human':
            message.append({'role': 'user', 'content': item['value']})
        elif item['from'] == 'assistant':
            message.append({'role': 'assistant', 'content': item['value']})
    return message
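
# Illustrative call (hypothetical sample turns, not taken from the dataset):
#   convert_message([{'from': 'human', 'value': '你好'},
#                    {'from': 'assistant', 'value': '你好,有什么可以帮你?'}])
# returns:
#   [{'role': 'system', 'content': '你是一个AI助手'},
#    {'role': 'user', 'content': '你好'},
#    {'role': 'assistant', 'content': '你好,有什么可以帮你?'}]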

with open(output_sft_data, 'a', encoding='utf-8') as sft:
    with open(sft_data, 'r', encoding='utf-8') as f:
        data = f.readlines()
        for item in tqdm(data, desc="Processing", unit="lines"):
            item = json.loads(item)
            message = convert_message(item['conversations'])
            sft.write(json.dumps(message, ensure_ascii=False) + '\n')
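
# Assumed workflow: run download_dataset.sh first, point pretrain_data and
# sft_data at the downloaded files, then run `python deal_dataset.py`.
# Note that both outputs are opened in append mode ('a'), so re-running the
# script duplicates records; delete the old .jsonl files before a fresh run.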