From 56fb0c34d45eb5f06135270111656f69c0b0e768 Mon Sep 17 00:00:00 2001 From: Reagan Zhang <147008135+MengYue-MK2000@users.noreply.github.com> Date: Thu, 19 Jun 2025 16:06:05 +0800 Subject: [PATCH 1/3] Update download_dataset.sh --- docs/chapter5/code/download_dataset.sh | 39 +++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/docs/chapter5/code/download_dataset.sh b/docs/chapter5/code/download_dataset.sh index 9a2892c..429ed04 100644 --- a/docs/chapter5/code/download_dataset.sh +++ b/docs/chapter5/code/download_dataset.sh @@ -1,3 +1,4 @@ +# MacOS 系统下载方式 #!/bin/bash # 设置环境变量 @@ -17,4 +18,40 @@ huggingface-cli download \ --repo-type dataset \ --resume-download \ BelleGroup/train_3.5M_CN \ - --local-dir "${dataset_dir}/BelleGroup" \ No newline at end of file + --local-dir "${dataset_dir}/BelleGroup" + +# Windows下载方式 +# 使用PowerShell下载 +# 暂时为当前PowerShell界面设置环境,关闭Powershell环境自动消失 +$env:HF_ENDPOINT = "https://hf-mirror.com" + +# 将\path\to\your\dataset替换成想要下载dataset目录地址 +$dataset_dir = "\path\to\your\dataset" + +# 需要预先安装modelscope,使用pip install modelscope安装 +modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir "$dataset_dir" + +tar -xvf "$dataset_dir\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "$dataset_dir" + +huggingface-cli download ` + --repo-type dataset ` + --resume-download ` + BelleGroup/train_3.5M_CN ` + --local-dir "$dataset_dir\BelleGroup" + +# 使用CMD下载 +# 暂时为当前CMD界面设置环境,关闭CMD环境自动消失 +set HF_ENDPOINT=https://hf-mirror.com + +# 将\path\to\your\dataset替换成想要下载dataset目录地址 +set dataset_dir=\path\to\your\dataset + +modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir %dataset_dir% + +tar -xvf "%dataset_dir%\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "%dataset_dir%" + +huggingface-cli download ^ + --repo-type dataset ^ + --resume-download ^ + BelleGroup/train_3.5M_CN ^ + --local-dir "%dataset_dir%\BelleGroup" From 18ff1a73a8ad7d97d9e39290cdf2d8ff99ec1a67 Mon Sep 17 00:00:00 2001 From: Reagan Zhang <147008135+MengYue-MK2000@users.noreply.github.com> Date: Thu, 19 Jun 2025 16:09:59 +0800 Subject: [PATCH 2/3] Update download_dataset.sh Update Mac installation for modelscope --- docs/chapter5/code/download_dataset.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/chapter5/code/download_dataset.sh b/docs/chapter5/code/download_dataset.sh index 429ed04..2a5a076 100644 --- a/docs/chapter5/code/download_dataset.sh +++ b/docs/chapter5/code/download_dataset.sh @@ -7,7 +7,7 @@ export HF_ENDPOINT=https://hf-mirror.com # dataset dir 下载到本地目录 dataset_dir="your local dataset dir" -# 下载预训练数据集 +# 下载预训练数据集, 需要预先安装modelscope,使用pip3 install modelscope安装 modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir ${dataset_dir} # 解压预训练数据集 From b1ac936d362901f7857a409b0dcfa2494c46e68e Mon Sep 17 00:00:00 2001 From: MengYue-MK2000 Date: Thu, 19 Jun 2025 17:52:24 +0800 Subject: [PATCH 3/3] created windows_download_dataset.sh, deleted original changes in download_dataset.sh --- docs/chapter5/code/download_dataset.sh | 37 ------------------- .../chapter5/code/windows_download_dataset.sh | 35 ++++++++++++++++++ 2 files changed, 35 insertions(+), 37 deletions(-) create mode 100644 docs/chapter5/code/windows_download_dataset.sh diff --git a/docs/chapter5/code/download_dataset.sh b/docs/chapter5/code/download_dataset.sh index 2a5a076..470906f 100644 --- a/docs/chapter5/code/download_dataset.sh +++ b/docs/chapter5/code/download_dataset.sh @@ -1,4 +1,3 @@ -# MacOS 系统下载方式 #!/bin/bash # 设置环境变量 @@ -19,39 +18,3 @@ huggingface-cli download \ --resume-download \ BelleGroup/train_3.5M_CN \ --local-dir "${dataset_dir}/BelleGroup" - -# Windows下载方式 -# 使用PowerShell下载 -# 暂时为当前PowerShell界面设置环境,关闭Powershell环境自动消失 -$env:HF_ENDPOINT = "https://hf-mirror.com" - -# 将\path\to\your\dataset替换成想要下载dataset目录地址 -$dataset_dir = "\path\to\your\dataset" - -# 需要预先安装modelscope,使用pip install modelscope安装 -modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir "$dataset_dir" - -tar -xvf "$dataset_dir\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "$dataset_dir" - -huggingface-cli download ` - --repo-type dataset ` - --resume-download ` - BelleGroup/train_3.5M_CN ` - --local-dir "$dataset_dir\BelleGroup" - -# 使用CMD下载 -# 暂时为当前CMD界面设置环境,关闭CMD环境自动消失 -set HF_ENDPOINT=https://hf-mirror.com - -# 将\path\to\your\dataset替换成想要下载dataset目录地址 -set dataset_dir=\path\to\your\dataset - -modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir %dataset_dir% - -tar -xvf "%dataset_dir%\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "%dataset_dir%" - -huggingface-cli download ^ - --repo-type dataset ^ - --resume-download ^ - BelleGroup/train_3.5M_CN ^ - --local-dir "%dataset_dir%\BelleGroup" diff --git a/docs/chapter5/code/windows_download_dataset.sh b/docs/chapter5/code/windows_download_dataset.sh new file mode 100644 index 0000000..8a18543 --- /dev/null +++ b/docs/chapter5/code/windows_download_dataset.sh @@ -0,0 +1,35 @@ +# Windows下载方式 +# 使用PowerShell下载 +# 暂时为当前PowerShell界面设置环境,关闭Powershell环境自动消失 +$env:HF_ENDPOINT = "https://hf-mirror.com" + +# 将\path\to\your\dataset替换成想要下载dataset目录地址 +$dataset_dir = "\path\to\your\dataset" + +# 需要预先安装modelscope,使用pip install modelscope安装 +modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir "$dataset_dir" + +tar -xvf "$dataset_dir\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "$dataset_dir" + +huggingface-cli download ` + --repo-type dataset ` + --resume-download ` + BelleGroup/train_3.5M_CN ` + --local-dir "$dataset_dir\BelleGroup" + +# 使用CMD下载 +# 暂时为当前CMD界面设置环境,关闭CMD环境自动消失 +set HF_ENDPOINT=https://hf-mirror.com + +# 将\path\to\your\dataset替换成想要下载dataset目录地址 +set dataset_dir=\path\to\your\dataset + +modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir %dataset_dir% + +tar -xvf "%dataset_dir%\mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2" -C "%dataset_dir%" + +huggingface-cli download ^ + --repo-type dataset ^ + --resume-download ^ + BelleGroup/train_3.5M_CN ^ + --local-dir "%dataset_dir%\BelleGroup"