add ch6 code

This commit is contained in:
Logan Zou
2025-04-25 10:04:43 +08:00
parent b034735b42
commit b4327f741a
9 changed files with 1424 additions and 445 deletions

View File

@@ -0,0 +1,13 @@
import json
import os
import subprocess

from tqdm import tqdm
# Optional step (disabled): download the pretraining dataset from ModelScope.
# os.system("modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir ./autodl-tmp/dataset/pretrain_data")
# Optional step (disabled): unpack the pretraining dataset archive.
# os.system("tar -xvf ./autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2")

# Point Hugging Face tooling at the hf-mirror.com endpoint. The variable is set
# on this process's environment, so the child process started below inherits it.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Download the SFT dataset (BelleGroup/train_3.5M_CN) into the local dataset
# directory. The command is passed as an argument list (no shell involved), and
# check=True raises CalledProcessError on a non-zero exit status so that a
# failed download is not silently ignored.
subprocess.run(
    [
        "huggingface-cli",
        "download",
        "--repo-type",
        "dataset",
        "--resume-download",
        "BelleGroup/train_3.5M_CN",
        "--local-dir",
        "./autodl-tmp/dataset/sft_data/BelleGroup",
    ],
    check=True,
)