"""Download and preprocess the pretraining and SFT datasets.

Steps performed when run as a script:

1. Download the seq-monkey pretraining corpus (via ``modelscope``) and the
   BelleGroup SFT dataset (via ``huggingface-cli``), then extract the corpus.
2. Split every pretraining document into fixed-size character chunks and
   write one JSON object per chunk to ``seq_monkey_datawhale.jsonl``.
3. Convert every BelleGroup conversation into a list of role/content
   messages and write one JSON list per line to ``BelleGroup_sft.jsonl``.
"""
import json
import os
import subprocess

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; keep working without it
    def tqdm(iterable, *args, **kwargs):
        """Fallback for a missing tqdm: return the iterable unchanged."""
        return iterable

# Route Hugging Face downloads through the mirror endpoint. Child processes
# started below inherit this environment variable.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Directory the pretraining archive is downloaded to and extracted in.
# Replace with a real path before running.
LOCAL_DIR = 'your_local_dir'
PRETRAIN_ARCHIVE = 'mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2'

# The archive is extracted into LOCAL_DIR, so the corpus is read from there
# as well (previously it was looked up in the current working directory).
# NOTE(review): assumes the .jsonl sits at the top level of the archive.
input_file = os.path.join(LOCAL_DIR, 'mobvoi_seq_monkey_general_open_corpus.jsonl')

# Mapping from the speaker tag of the raw SFT data to the chat role name.
_ROLE_BY_SPEAKER = {'human': 'user', 'assistant': 'assistant'}


def download_datasets(local_dir=LOCAL_DIR):
    """Download both datasets and extract the pretraining archive.

    Args:
        local_dir: Directory that receives the pretraining archive and its
            extracted contents.

    Raises:
        subprocess.CalledProcessError: If any command exits with a non-zero
            status (the failure is no longer silently ignored).
        FileNotFoundError: If a required command-line tool is not installed.
    """
    # Download the pretraining dataset.
    subprocess.run(
        ['modelscope', 'download', '--dataset', 'ddzhu123/seq-monkey',
         PRETRAIN_ARCHIVE, '--local_dir', local_dir],
        check=True,
    )
    # Extract the pretraining dataset.
    subprocess.run(
        ['tar', '-xvf', os.path.join(local_dir, PRETRAIN_ARCHIVE), '-C', local_dir],
        check=True,
    )
    # Download the SFT dataset.
    subprocess.run(
        ['huggingface-cli', 'download', '--repo-type', 'dataset',
         '--resume-download', 'BelleGroup/train_3.5M_CN',
         '--local-dir', 'BelleGroup'],
        check=True,
    )


# 1 Process the pretraining data
def split_text(text, chunk_size=512):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in original order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive (a negative size would
            otherwise silently drop the whole text).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def process_pretrain_data(input_path, output_path, chunk_size=512):
    """Chunk every document of a JSONL corpus and append the chunks to a JSONL file.

    Each non-blank input line must be a JSON object with a ``text`` field.
    Every chunk is written as ``{"text": chunk}`` on its own line.

    Args:
        input_path: Path of the source JSONL corpus.
        output_path: Path of the JSONL file to append to.
        chunk_size: Maximum number of characters per chunk.
    """
    # Open the input first so a missing corpus does not leave an empty output
    # file behind. NOTE: the output is opened in append mode, as before, so
    # re-running the script adds duplicate records.
    with open(input_path, 'r', encoding='utf-8') as f, \
            open(output_path, 'a', encoding='utf-8') as pretrain:
        # Iterate the file lazily: the corpus is far too large for readlines().
        for line in tqdm(f, desc=f"Processing lines in {input_path}", leave=False):
            if not line.strip():
                continue  # tolerate blank lines instead of crashing in json.loads
            record = json.loads(line)
            for chunk in split_text(record['text'], chunk_size):
                pretrain.write(json.dumps({'text': chunk}, ensure_ascii=False) + '\n')


# 2 Process the SFT data
def convert_message(data):
    """Convert one raw conversation into the standard chat-message format.

    Args:
        data: Iterable of dicts, each with a ``from`` key and, for the
            ``human`` and ``assistant`` speakers, a ``value`` key.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts that starts with the
        fixed system message. ``human`` turns become ``user`` messages,
        ``assistant`` turns stay ``assistant``; any other speaker is skipped.
    """
    message = [
        {"role": "system", "content": "你是一个AI助手"},
    ]
    for item in data:
        role = _ROLE_BY_SPEAKER.get(item['from'])
        if role is not None:
            message.append({'role': role, 'content': item['value']})
    return message


def process_sft_data(input_path, output_path):
    """Convert every conversation of the SFT dataset and append it to a JSONL file.

    Each non-blank input line must be a JSON object with a ``conversations``
    field; the converted message list is written as one JSON array per line.

    Args:
        input_path: Path of the raw SFT file (one JSON object per line).
        output_path: Path of the JSONL file to append to.
    """
    # Explicit UTF-8: the data is Chinese text and the platform default
    # encoding is not UTF-8 everywhere. The output is appended to, as before.
    with open(input_path, 'r', encoding='utf-8') as f, \
            open(output_path, 'a', encoding='utf-8') as sft:
        for line in tqdm(f, desc="Processing", unit="lines"):
            if not line.strip():
                continue
            item = json.loads(line)
            message = convert_message(item['conversations'])
            sft.write(json.dumps(message, ensure_ascii=False) + '\n')


def main():
    """Run the full pipeline: download, then preprocess both datasets."""
    download_datasets()
    process_pretrain_data(input_file, 'seq_monkey_datawhale.jsonl')
    process_sft_data('BelleGroup/train_3.5M_CN.json', 'BelleGroup_sft.jsonl')


if __name__ == '__main__':
    main()