From b4327f741aa372d128890fb0b1d2296a54d55108 Mon Sep 17 00:00:00 2001 From: Logan Zou Date: Fri, 25 Apr 2025 10:04:43 +0800 Subject: [PATCH] add ch6 code --- docs/chapter6/code/download_dataset.py | 13 + docs/chapter6/code/download_model.py | 7 + docs/chapter6/code/ds_config_zero2.json | 52 ++ docs/chapter6/code/finetune.py | 269 +++++++ docs/chapter6/code/finetune.sh | 27 + docs/chapter6/code/pretrain.ipynb | 882 +++++++++++++++++++++++ docs/chapter6/code/pretrain.py | 546 +++----------- docs/chapter6/code/pretrain.sh | 29 + docs/chapter6/code/process_dataset.ipynb | 44 ++ 9 files changed, 1424 insertions(+), 445 deletions(-) create mode 100644 docs/chapter6/code/download_dataset.py create mode 100644 docs/chapter6/code/download_model.py create mode 100644 docs/chapter6/code/ds_config_zero2.json create mode 100644 docs/chapter6/code/finetune.py create mode 100644 docs/chapter6/code/finetune.sh create mode 100644 docs/chapter6/code/pretrain.ipynb create mode 100644 docs/chapter6/code/pretrain.sh create mode 100644 docs/chapter6/code/process_dataset.ipynb diff --git a/docs/chapter6/code/download_dataset.py b/docs/chapter6/code/download_dataset.py new file mode 100644 index 0000000..00ef519 --- /dev/null +++ b/docs/chapter6/code/download_dataset.py @@ -0,0 +1,13 @@ +import os +import json +from tqdm import tqdm + +# 下载预训练数据集 +# os.system("modelscope download --dataset ddzhu123/seq-monkey mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2 --local_dir ./autodl-tmp/dataset/pretrain_data") +# # 解压预训练数据集 +# os.system("tar -xvf ./autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus.jsonl.tar.bz2") + +# 设置环境变量 +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +# 下载SFT数据集 +os.system(f'huggingface-cli download --repo-type dataset --resume-download BelleGroup/train_3.5M_CN --local-dir ./autodl-tmp/dataset/sft_data/BelleGroup') diff --git a/docs/chapter6/code/download_model.py b/docs/chapter6/code/download_model.py new file mode 100644 index 0000000..83f4365 --- /dev/null +++ b/docs/chapter6/code/download_model.py @@ -0,0 +1,7 @@ +import os + +# 设置环境变量 +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' + +# 下载模型 +os.system('huggingface-cli download --resume-download Qwen/Qwen2.5-1.5B --local-dir autodl-tmp/qwen-1.5b') \ No newline at end of file diff --git a/docs/chapter6/code/ds_config_zero2.json b/docs/chapter6/code/ds_config_zero2.json new file mode 100644 index 0000000..4be2c0b --- /dev/null +++ b/docs/chapter6/code/ds_config_zero2.json @@ -0,0 +1,52 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/docs/chapter6/code/finetune.py b/docs/chapter6/code/finetune.py new file mode 100644 index 0000000..3916832 --- /dev/null +++ b/docs/chapter6/code/finetune.py @@ -0,0 +1,269 @@ +''' +SFT 脚本 +''' + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from torchdata.datapipes.iter import IterableWrapper +from itertools import chain +import deepspeed +from typing import Optional,List,Dict +from torch.utils.data import Dataset +import json + + +import datasets +import pandas as pd +import torch +from datasets import load_dataset +import transformers +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +import datetime +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +import wandb +from tqdm import tqdm + + +logger = logging.getLogger(__name__) + + +# 超参类 +@dataclass +class ModelArguments: + """ + 关于模型的参数 + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "预训练模型参数地址" + ) + }, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "模型训练使用的数据类型,推荐 bfloat16" + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + + +@dataclass +class DataTrainingArguments: + """ + 关于训练的参数 + """ + + train_files: Optional[str] = field(default=None, metadata={"help": "训练数据路径"}) + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "最大文本块长度" + ) + }, + ) + +# 指令文本处理 +# 参考:https://github.com/QwenLM/Qwen/blob/main/finetune.py +def preprocess(sources, tokenizer, max_len, system_message: str = "You are a helpful assistant."): + # prompt 模板 + roles = {"human": "<|im_start|>human", "assistant": "<|im_start|>assistant"} + + # 不同的 tokenizer 需要特别定义 + # BOS + im_start = tokenizer("<|im_start|>").input_ids + # EOS + im_end = tokenizer("<|im_end|>").input_ids + # PAD + IGNORE_TOKEN_ID = tokenizer.pad_token_id + # 换行符 + nl_tokens = tokenizer('\n').input_ids + # 角色标识符 + _system = tokenizer('system').input_ids + nl_tokens + _user = tokenizer('human').input_ids + nl_tokens + _assistant = tokenizer('assistant').input_ids + nl_tokens + + # 拼接多轮对话 + input_ids, targets = [], [] + for i in tqdm(range(len(sources))): + source = sources[i] + # 从 user 开始 + if source[0]["from"] != "human": + source = source[1:] + # 分别是输入和输出 + input_id, target = [], [] + # system: 【BOS】system\nYou are a helpful assistant.【EOS】\n + system = im_start + _system + tokenizer(system_message).input_ids + im_end + nl_tokens + input_id += system + # system 不需要拟合 + target += im_start + [IGNORE_TOKEN_ID] * (len(system)-3) + im_end + nl_tokens + assert len(input_id) == len(target) + # 依次拼接 + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + # user:<|im_start|>human\ninstruction【EOS】\n + # assistant:<|im_start|>assistant\nresponse【EOS】\n + _input_id = tokenizer(role).input_ids + nl_tokens + \ + tokenizer(sentence["value"]).input_ids + im_end + nl_tokens + input_id += _input_id + if role == '<|im_start|>human': + # user 不需要拟合 + _target = im_start + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + im_end + nl_tokens + elif role == '<|im_start|>assistant': + # assistant 需要拟合 + _target = im_start + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \ + _input_id[len(tokenizer(role).input_ids)+1:-2] + im_end + nl_tokens + else: + print(role) + raise NotImplementedError + target += _target + assert len(input_id) == len(target) + # 最后进行 PAD + input_id += [tokenizer.pad_token_id] * (max_len - len(input_id)) + target += [IGNORE_TOKEN_ID] * (max_len - len(target)) + input_ids.append(input_id[:max_len]) + targets.append(target[:max_len]) + # print(input_ids) + input_ids = torch.tensor(input_ids) + targets = torch.tensor(targets) + + return dict( + input_ids=input_ids, + labels=targets, + attention_mask=input_ids.ne(tokenizer.pad_token_id), + ) +# 自定义一个 Dataset +from typing import Dict + +class SupervisedDataset(Dataset): + + def __init__(self, raw_data, tokenizer, max_len: int): + super(SupervisedDataset, self).__init__() + # 加载并预处理数据 + sources = [example["conversations"] for example in raw_data] + data_dict = preprocess(sources, tokenizer, max_len) + + self.input_ids = data_dict["input_ids"] + self.labels = data_dict["labels"] + self.attention_mask = data_dict["attention_mask"] + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return dict( + input_ids=self.input_ids[i], + labels=self.labels[i], + attention_mask=self.attention_mask[i], + ) + + +def main(): + + # 加载脚本参数 + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # 初始化 WandB + wandb.init(project="sft", name="qwen-1.5b") + + # 设置日志 + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + # 将日志级别设置为 INFO + transformers.utils.logging.set_verbosity_info() + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # 训练整体情况记录 + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # 检查 checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"输出路径 ({training_args.output_dir}) 非空 " + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"从 {last_checkpoint}恢复训练" + ) + + # 设置随机数种子. + set_seed(training_args.seed) + + # 初始化模型 + logger.warning("加载预训练模型") + logger.info(f"模型参数地址:{model_args.model_name_or_path}") + model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,trust_remote_code=True) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) + logger.info(f"继承一个预训练模型 - Total size={n_params/2**20:.2f}M params") + + # 初始化 Tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) + logger.info("完成 tokenzier 加载") + + # 加载微调数据 + with open(data_args.train_files) as f: + lst = [json.loads(line) for line in f.readlines()[:10000]] + logger.info("完成训练集加载") + logger.info(f"训练集地址:{data_args.train_files}") + logger.info(f'训练样本总数:{len(lst)}') + # logger.info(f"训练集采样:{ds["train"][0]}") + + train_dataset = SupervisedDataset(lst, tokenizer=tokenizer, max_len=2048) + + logger.info("初始化 Trainer") + trainer = Trainer( + model=model, + args=training_args, + train_dataset= IterableWrapper(train_dataset), + tokenizer=tokenizer + ) + + # 从 checkpoint 加载 + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + logger.info("开始训练") + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docs/chapter6/code/finetune.sh b/docs/chapter6/code/finetune.sh new file mode 100644 index 0000000..20cfde4 --- /dev/null +++ b/docs/chapter6/code/finetune.sh @@ -0,0 +1,27 @@ +CUDA_VISIBLE_DEVICES=0,1 + +deepspeed finetune.py \ + --model_name_or_path autodl-tmp/qwen-1.5b \ + --train_files autodl-tmp/dataset/sft_data/BelleGroup/train_3.5M_CN.json \ + --per_device_train_batch_size 16 \ + --gradient_accumulation_steps 4 \ + --do_train \ + --output_dir autodl-tmp/output/sft \ + --evaluation_strategy no \ + --learning_rate 1e-4 \ + --num_train_epochs 3 \ + --warmup_steps 200 \ + --logging_dir autodl-tmp/output/sft/logs \ + --logging_strategy steps \ + --logging_steps 5 \ + --save_strategy steps \ + --save_steps 100 \ + --save_total_limit 1 \ + --seed 12 \ + --block_size 2048 \ + --bf16 \ + --gradient_checkpointing \ + --deepspeed ./ds_config_zero2.json \ + --report_to wandb + + # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ \ No newline at end of file diff --git a/docs/chapter6/code/pretrain.ipynb b/docs/chapter6/code/pretrain.ipynb new file mode 100644 index 0000000..dee2c6e --- /dev/null +++ b/docs/chapter6/code/pretrain.ipynb @@ -0,0 +1,882 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bb9102c3-5b8d-4295-8f29-113b35ec5679", + "metadata": {}, + "source": [ + "# 一、LLM 预训练" + ] + }, + { + "cell_type": "markdown", + "id": "8557a6a6-294a-49c3-a8f6-e58bc3bf443d", + "metadata": {}, + "source": [ + "1.1 初始化 LLM" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "25f1fad8-772c-474e-a43e-77623106485d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2Config {\n", + " \"_name_or_path\": \"autodl-tmp/qwen-1.5b\",\n", + " \"architectures\": [\n", + " \"Qwen2ForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 151643,\n", + " \"eos_token_id\": 151643,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 1536,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 8960,\n", + " \"max_position_embeddings\": 131072,\n", + " \"max_window_layers\": 28,\n", + " \"model_type\": \"qwen2\",\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 28,\n", + " \"num_key_value_heads\": 2,\n", + " \"rms_norm_eps\": 1e-06,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": true,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.44.2\",\n", + " \"use_cache\": true,\n", + " \"use_mrope\": false,\n", + " \"use_sliding_window\": false,\n", + " \"vocab_size\": 151936\n", + "}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 加载定义好的模型参数-此处以 Qwen-2.5-1.5B 为例\n", + "# 使用 transforemrs 的 Config 类进行加载\n", + "from transformers import AutoConfig\n", + "\n", + "model_path = \"autodl-tmp/qwen-1.5b\"\n", + "config = AutoConfig.from_pretrained(model_path)\n", + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "82b075a1-4fe9-4abb-b5b4-769d1c1a7156", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training new model from scratch - Total size=1472.20M params\n" + ] + } + ], + "source": [ + "# 使用该配置生成一个定义好的模型\n", + "from transformers import AutoModelForCausalLM\n", + "\n", + "model = AutoModelForCausalLM.from_config(config,trust_remote_code=True)\n", + "model.to(\"cuda\")\n", + "n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())\n", + "print(f\"Training new model from scratch - Total size={n_params/2**20:.2f}M params\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e05ea707-23db-4e67-8b7d-e57d019887dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2ForCausalLM(\n", + " (model): Qwen2Model(\n", + " (embed_tokens): Embedding(151936, 1536)\n", + " (layers): ModuleList(\n", + " (0-27): 28 x Qwen2DecoderLayer(\n", + " (self_attn): Qwen2SdpaAttention(\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (k_proj): Linear(in_features=1536, out_features=256, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=256, bias=True)\n", + " (o_proj): Linear(in_features=1536, out_features=1536, bias=False)\n", + " (rotary_emb): Qwen2RotaryEmbedding()\n", + " )\n", + " (mlp): Qwen2MLP(\n", + " (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)\n", + " (up_proj): Linear(in_features=1536, out_features=8960, bias=False)\n", + " (down_proj): Linear(in_features=8960, out_features=1536, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n", + " (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)\n", + " )\n", + " )\n", + " (norm): Qwen2RMSNorm((1536,), eps=1e-06)\n", + " )\n", + " (lm_head): Linear(in_features=1536, out_features=151936, bias=False)\n", + ")" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 看一下模型\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3408137b-eb50-4119-be1c-7a4ff951ab24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2TokenizerFast(name_or_path='autodl-tmp/qwen-1.5b', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 加载一个预训练好的 tokenizer\n", + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + "tokenizer" + ] + }, + { + "cell_type": "markdown", + "id": "221a0fe2-a244-4e73-b82c-6da255d710dd", + "metadata": {}, + "source": [ + "1.2 预训练数据准备" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "936261a6-94cf-4cf3-842c-d3f1fde47a71", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "66ae9baa159b424ea5f5bc8d05b9b567", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating train split: 0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 加载预训练数据\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset('json', data_files='autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus_small.jsonl')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "068edbb9-cb3c-49b1-aaf9-67b97ddfc58c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'text': '在查处虚开增值税专用发票案件中,常常涉及进项留抵税额和税款损失的认定和处理。在计算税款损失时,要不要将进项留抵税额包括在内?\\n对此,实务中存在意见分歧。\\n有人主张归并,即计算税款损失时包括进项留抵税额;\\n有人主张剥离,即计算税款损失时剔除进项留抵税额。分析这个问题,需要确定进项留抵税额与税款损失之间是什么关系。\\n理清这二者之间的关系,首先需要了解增值税的概念和其抵扣机制。增值税是以商品(货物、服务等)在流转过程中产生的增值额作为计税依据而征收的一种流转税。为避免重复征税,在增值税中存在抵扣链条机制。\\n一般而言,交易上游企业缴纳的税额,交易下游企业可以对相应的税额进行抵扣。\\n对增值税一般纳税人来说,其购进货物、服务等取得增值税专用发票,发票上的税额是进项税额。\\n其出售货物、服务等,向购买方开具增值税专用发票,发票的税额是销项税额。\\n一般情况下,销项税额减去进项税额的金额是应纳税额,企业根据应纳税额按期申报纳税。\\n其次需要了解进项留抵税额的概念及产生原因。\\n在计算销项税额和进项税额的差额时,有时会出现负数,即当期进项税额大于当期销项税额。这个差额在当期未实现抵扣,为进项留抵税额,在以后纳税人有销项税额时再进行抵扣。\\n企业产生进项留抵税额的主要原因是其进项税额和销项税额时间上的不一致。\\n例如,企业前期集中采购货物和服务,投资大,销项税率低于进项税率等。\\n从税款抵扣的角度看,进项留抵税额只是购进的这部分进项税额参与到增值税应纳税额的计算过程中,但是其对应的进项税额抵扣还未真正实现,一般要等到其未来有相应的销项税额时,才能真正实现进项税额抵扣。\\n可见,进项留抵税额处于不确定状态,能否抵扣受到很多因素影响,例如企业经营中断,没有销项税额,这时进项留抵税额就无法实现抵扣。但如果企业按照税收政策规定申请进项留抵退税,进项税额抵扣就随之实现。\\n最后需要了解税款损失的概念。\\n税款损失,通常是指因虚开增值税专用发票,导致国家税款被骗或者流失的金额。关于税款损失,实务中有多种表述。\\n例如,北京大学法学院教授陈兴良曾谈到虚开行为本身不会造成国家税款损失,只有利用发票抵扣时才会造成国家税款损失。刘兵等编著的《虚开增值税专用发票案例司法观点和案例解析》一书中提到:“给国家税款造成损失的数额,实际上就是被骗取的国家税款在侦查终结以前无法追回的部分。”\\n赵清海与王家欣合著的《增值税专用发票虚开的判定与预防》一书中提到:“司法实践中,受票方用虚开的增值税专用发票予以抵扣的税款,从而导致受票方应纳税额的减少是法院所认定的国家税款流失的金额。”\\n从这些表述可见,税款损失应该是实际造成的损失,不应包括不确定的部分——进项留抵税额,进项留抵税额与税款损失之间不能直接画等号。\\n综上分析,进项留抵税额,只是使国家税款处于可能被抵扣的状态,还没有真正造成国家税款流失,一般情况下应将其从税款损失中剥离,特殊条件下将其归并入税款损失。\\n例如,当纳税人造假按照税收政策规定申请进项留抵税额退税后,有关税款损失将会从危险状态转化成危害结果,这时候要将有关进项留抵税额并入税款损失。\\n所以,在虚开增值税专用发票案件中,一般情况下,如果以纳税人的进项税额作为税款损失的计算基数,在对其进行行政处罚或刑事处罚时,应把进项留抵税额从税款损失中剔除,但纳税人申请进项留抵退税的除外。这样处理,把处罚与危害结果相对应,体现行政处罚法的过罚相当原则和刑法的罚当其罪原则。'}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[\"train\"][0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ef372a1f-e82f-4f5d-8495-f21f06b35635", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['text']\n" + ] + } + ], + "source": [ + "# 查看特征\n", + "column_names = list(ds[\"train\"].features)\n", + "print(column_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1fa637f5-3b23-4a33-b19b-4c90d1815c39", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "316489431b9e494eb8358a0d0048096f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running tokenizer on dataset (num_proc=10): 0%| | 0/100001 [00:00= block_size:\n", + " total_length = (total_length // block_size) * block_size\n", + " # Split by chunks of max_len.\n", + " result = {\n", + " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", + " for k, t in concatenated_examples.items()\n", + " }\n", + " # print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) \n", + " print(\"group texts input examples length%d after_group size%d\"%(len(examples['input_ids']),len(result[\"input_ids\"])))\n", + " result[\"labels\"] = result[\"input_ids\"].copy()\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "38428a53-6ba6-429f-8c4b-0985579e726b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ae53ab8aaa0043418c2b7eb86f3d462b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Grouping texts in chunks of 2048 (num_proc=10): 0%| | 0/100001 [00:00\n", + " \n", + " \n", + " [ 101/1751 29:31 < 8:12:11, 0.06 it/s, Epoch 0.06/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
1010.987700
209.160700
308.352700
408.159800
508.042500
608.014400
707.986700
807.951800
907.875500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "RuntimeError", + "evalue": "[enforce fail at inline_container.cc:603] . unexpected pos 6546708864 vs 6546708760", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/torch/serialization.py:652\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 651\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _open_zipfile_writer(f) \u001b[38;5;28;01mas\u001b[39;00m opened_zipfile:\n\u001b[0;32m--> 652\u001b[0m \u001b[43m_save\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_protocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_disable_byteorder_record\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/torch/serialization.py:886\u001b[0m, in \u001b[0;36m_save\u001b[0;34m(obj, zip_file, pickle_module, pickle_protocol, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 885\u001b[0m num_bytes \u001b[38;5;241m=\u001b[39m storage\u001b[38;5;241m.\u001b[39mnbytes()\n\u001b[0;32m--> 886\u001b[0m \u001b[43mzip_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_record\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_bytes\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:778] . PytorchStreamWriter failed writing file data/401: file write failed", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstart train\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 2\u001b[0m train_result \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1936\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1939\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1940\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1941\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1942\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1943\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2356\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2353\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m=\u001b[39m epoch \u001b[38;5;241m+\u001b[39m (step \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m steps_skipped) \u001b[38;5;241m/\u001b[39m steps_in_epoch\n\u001b[1;32m 2354\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[0;32m-> 2356\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_log_save_evaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtr_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_norm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2357\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2358\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_substep_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2807\u001b[0m, in \u001b[0;36mTrainer._maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2804\u001b[0m metrics \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_evaluate(trial, ignore_keys_for_eval)\n\u001b[1;32m 2806\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[0;32m-> 2807\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2808\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_save(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:2890\u001b[0m, in \u001b[0;36mTrainer._save_checkpoint\u001b[0;34m(self, model, trial, metrics)\u001b[0m\n\u001b[1;32m 2886\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_model(output_dir, _internal_call\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 2888\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39msave_only_model:\n\u001b[1;32m 2889\u001b[0m \u001b[38;5;66;03m# Save optimizer and scheduler\u001b[39;00m\n\u001b[0;32m-> 2890\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_optimizer_and_scheduler\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2891\u001b[0m \u001b[38;5;66;03m# Save RNG state\u001b[39;00m\n\u001b[1;32m 2892\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_save_rng_state(output_dir)\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/transformers/trainer.py:3006\u001b[0m, in \u001b[0;36mTrainer._save_optimizer_and_scheduler\u001b[0;34m(self, output_dir)\u001b[0m\n\u001b[1;32m 3001\u001b[0m save_fsdp_optimizer(\n\u001b[1;32m 3002\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfsdp_plugin, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptimizer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, output_dir\n\u001b[1;32m 3003\u001b[0m )\n\u001b[1;32m 3004\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[1;32m 3005\u001b[0m \u001b[38;5;66;03m# deepspeed.save_checkpoint above saves model/optim/sched\u001b[39;00m\n\u001b[0;32m-> 3006\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstate_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mOPTIMIZER_NAME\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3008\u001b[0m \u001b[38;5;66;03m# Save SCHEDULER & SCALER\u001b[39;00m\n\u001b[1;32m 3009\u001b[0m is_deepspeed_custom_scheduler \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_deepspeed_enabled \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\n\u001b[1;32m 3010\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlr_scheduler, DeepSpeedSchedulerWrapper\n\u001b[1;32m 3011\u001b[0m )\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/torch/serialization.py:651\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 648\u001b[0m _check_save_filelike(f)\n\u001b[1;32m 650\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _use_new_zipfile_serialization:\n\u001b[0;32m--> 651\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _open_zipfile_writer(f) \u001b[38;5;28;01mas\u001b[39;00m opened_zipfile:\n\u001b[1;32m 652\u001b[0m _save(obj, opened_zipfile, pickle_module, pickle_protocol, _disable_byteorder_record)\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n", + "File \u001b[0;32m~/miniconda3/lib/python3.10/site-packages/torch/serialization.py:499\u001b[0m, in \u001b[0;36m_open_zipfile_writer_file.__exit__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 499\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfile_like\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_end_of_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 501\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream\u001b[38;5;241m.\u001b[39mclose()\n", + "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:603] . unexpected pos 6546708864 vs 6546708760" + ] + } + ], + "source": [ + "print('start train')\n", + "train_result = trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "a1ed2cd9-7169-4376-a26c-053918074761", + "metadata": {}, + "source": [ + "# 二、模型 SFT" + ] + }, + { + "cell_type": "markdown", + "id": "1bb6e02b-c04c-45a4-b36c-904f9fedf61e", + "metadata": {}, + "source": [ + "2.1 处理指令数据" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0d7cd012-fa2d-4c21-b6a5-c3830d12f59b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'conversations': [{'from': 'human',\n", + " 'value': '针对健身房的新手,设计一套适合他们的健身器械使用指南,包括安全应用、正确姿势等方面。'},\n", + " {'from': 'assistant',\n", + " 'value': '健身器械使用指南\\n1. 开始前,请先进行热身运动。这会帮助你的身体适应运动,并减少受伤的风险。\\n2. 在使用健身器械前,确保你已经了解了其使用方法。请阅读说明书或咨询教练以获得正确的使用技巧。\\n3. 谨防过度使用或过度挑战你的身体。 如果你觉得有些动作太难或太重,请添加锻炼计划,以逐步提高动作难度。\\n4. 使用合适的装备。 确保你拥有合适的运动鞋和舒适的运动服。 不要在裸露的脚或短裤上进行重量训练。\\n5. 在健身器械上使用安全装置。 这些通常用于保护你的身体免受不当操作造成的损伤。 例如,重量训练中,你需要使用杠铃和负重时,一定要使用卡子来防止重量滑落。\\n6. 注意正确的姿势。 如果你的姿势是错误的,那么你的身体很容易被伤害到,你也可能无法获得最佳的锻炼效果。 至关重要的是,保持直立的身体,保持头部和颈部的稳定,并使用合适的重量。\\n7. 保持合理的呼吸方式。 无论何时进行训练,都必须保持正常呼吸。 当你需要用力时,呼气; 当你放松时,吸气。\\n8. 安全存放器械。 在使用健身器械后,你需要把它们归还给适当的位置,以便其他人可以使用它们。\\n总之,健身器械的正确使用是关键之一,如果不健康和不安全,它们将无法帮助您达到您所需的健康成果。 选择适当的训练计划,并为训练提供足够的时间,以备逐渐适应新方法。 对于任何问题,请向教练咨询。'}],\n", + " 'id': '66182880'}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "\n", + "with open(\"autodl-tmp/dataset/sft_data/BelleGroup/train_3.5M_CN.json\") as f:\n", + " lst = [json.loads(line) for line in f.readlines()]\n", + "\n", + "lst[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2fc8c599-89e9-4c35-a011-d2e52a1a4d9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Qwen2TokenizerFast(name_or_path='autodl-tmp/qwen-1.5b', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t151643: AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151644: AddedToken(\"<|im_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151645: AddedToken(\"<|im_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151646: AddedToken(\"<|object_ref_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151647: AddedToken(\"<|object_ref_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151648: AddedToken(\"<|box_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151649: AddedToken(\"<|box_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151650: AddedToken(\"<|quad_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151651: AddedToken(\"<|quad_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151652: AddedToken(\"<|vision_start|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151653: AddedToken(\"<|vision_end|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151654: AddedToken(\"<|vision_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151655: AddedToken(\"<|image_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151656: AddedToken(\"<|video_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t151657: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151658: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151659: AddedToken(\"<|fim_prefix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151660: AddedToken(\"<|fim_middle|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151661: AddedToken(\"<|fim_suffix|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151662: AddedToken(\"<|fim_pad|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151663: AddedToken(\"<|repo_name|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "\t151664: AddedToken(\"<|file_sep|>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),\n", + "}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 加载一个预训练好的 tokenizer\n", + "from transformers import AutoTokenizer\n", + "\n", + "model_path = \"autodl-tmp/qwen-1.5b\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", + "tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "46730b29-41c0-4295-81f2-913d069b4669", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from tqdm import tqdm\n", + "\n", + "# 指令文本处理\n", + "# 参考:https://github.com/QwenLM/Qwen/blob/main/finetune.py\n", + "def preprocess(sources, tokenizer, max_len, system_message: str = \"You are a helpful assistant.\"):\n", + " # prompt 模板\n", + " roles = {\"human\": \"<|im_start|>human\", \"assistant\": \"<|im_start|>assistant\"}\n", + "\n", + " # 不同的 tokenizer 需要特别定义\n", + " # BOS\n", + " im_start = tokenizer(\"<|im_start|>\").input_ids\n", + " # EOS\n", + " im_end = tokenizer(\"<|im_end|>\").input_ids\n", + " # PAD\n", + " IGNORE_TOKEN_ID = tokenizer.pad_token_id\n", + " # 换行符\n", + " nl_tokens = tokenizer('\\n').input_ids\n", + " # 角色标识符\n", + " _system = tokenizer('system').input_ids + nl_tokens\n", + " _user = tokenizer('human').input_ids + nl_tokens\n", + " _assistant = tokenizer('assistant').input_ids + nl_tokens\n", + "\n", + " # 拼接多轮对话\n", + " input_ids, targets = [], []\n", + " for i in tqdm(range(len(sources))):\n", + " source = sources[i]\n", + " # 从 user 开始\n", + " if source[0][\"from\"] != \"human\":\n", + " source = source[1:]\n", + " # 分别是输入和输出\n", + " input_id, target = [], []\n", + " # system: 【BOS】system\\nYou are a helpful assistant.【EOS】\\n\n", + " system = im_start + _system + tokenizer(system_message).input_ids + im_end + nl_tokens\n", + " input_id += system\n", + " # system 不需要拟合\n", + " target += im_start + [IGNORE_TOKEN_ID] * (len(system)-3) + im_end + nl_tokens\n", + " assert len(input_id) == len(target)\n", + " # 依次拼接\n", + " for j, sentence in enumerate(source):\n", + " role = roles[sentence[\"from\"]]\n", + " # user:<|im_start|>human\\ninstruction【EOS】\\n\n", + " # assistant:<|im_start|>assistant\\nresponse【EOS】\\n\n", + " _input_id = tokenizer(role).input_ids + nl_tokens + \\\n", + " tokenizer(sentence[\"value\"]).input_ids + im_end + nl_tokens\n", + " input_id += _input_id\n", + " if role == '<|im_start|>human':\n", + " # user 不需要拟合\n", + " _target = im_start + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + im_end + nl_tokens\n", + " elif role == '<|im_start|>assistant':\n", + " # assistant 需要拟合\n", + " _target = im_start + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \\\n", + " _input_id[len(tokenizer(role).input_ids)+1:-2] + im_end + nl_tokens\n", + " else:\n", + " print(role)\n", + " raise NotImplementedError\n", + " target += _target\n", + " assert len(input_id) == len(target)\n", + " # 最后进行 PAD\n", + " input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))\n", + " target += [IGNORE_TOKEN_ID] * (max_len - len(target))\n", + " input_ids.append(input_id[:max_len])\n", + " targets.append(target[:max_len])\n", + " # print(input_ids)\n", + " input_ids = torch.tensor(input_ids)\n", + " targets = torch.tensor(targets)\n", + "\n", + " return dict(\n", + " input_ids=input_ids,\n", + " labels=targets,\n", + " attention_mask=input_ids.ne(tokenizer.pad_token_id),\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7b3576cb-04d7-448a-9bd1-07cb7b344e6d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[151644, 8948, 198, ..., 151643, 151643, 151643],\n", + " [151644, 8948, 198, ..., 151643, 151643, 151643]]),\n", + " 'labels': tensor([[151644, 151643, 151643, ..., 151643, 151643, 151643],\n", + " [151644, 151643, 151643, ..., 151643, 151643, 151643]]),\n", + " 'attention_mask': tensor([[ True, True, True, ..., False, False, False],\n", + " [ True, True, True, ..., False, False, False]])}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 测试一下\n", + "preprocess([lst[0][\"conversations\"],lst[1][\"conversations\"]], tokenizer, 1024)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "63e01dcf-4de4-4470-97dd-3317ef1aa00b", + "metadata": {}, + "outputs": [], + "source": [ + "# 自定义一个 Dataset\n", + "from torch.utils.data import Dataset\n", + "from typing import Dict\n", + "\n", + "class SupervisedDataset(Dataset):\n", + "\n", + " def __init__(self, raw_data, tokenizer, max_len: int):\n", + " super(SupervisedDataset, self).__init__()\n", + " # 加载并预处理数据\n", + " sources = [example[\"conversations\"] for example in raw_data[:10000]]\n", + " data_dict = preprocess(sources, tokenizer, max_len)\n", + "\n", + " self.input_ids = data_dict[\"input_ids\"]\n", + " self.labels = data_dict[\"labels\"]\n", + " self.attention_mask = data_dict[\"attention_mask\"]\n", + "\n", + " def __len__(self):\n", + " return len(self.input_ids)\n", + "\n", + " def __getitem__(self, i) -> Dict[str, torch.Tensor]:\n", + " return dict(\n", + " input_ids=self.input_ids[i],\n", + " labels=self.labels[i],\n", + " attention_mask=self.attention_mask[i],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "934316d3-098f-4889-9cb0-d234a630b194", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 10000/10000 [00:08<00:00, 1235.98it/s]\n" + ] + } + ], + "source": [ + "train_ds = SupervisedDataset(lst, tokenizer=tokenizer, max_len=2048)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/chapter6/code/pretrain.py b/docs/chapter6/code/pretrain.py index 7a60954..04b9257 100644 --- a/docs/chapter6/code/pretrain.py +++ b/docs/chapter6/code/pretrain.py @@ -1,15 +1,5 @@ -#!/usr/bin/env python -# -*- encoding: utf-8 -*- ''' -@File : pretrain.py -@Time : 2025/04/10 16:43:43 -@Author : Logan Zou -@Version : 1.0 -@Contact : loganzou0421@163.com -@License : (C)Copyright 2017-2018, Liugroup-NLPR-CASIA -@Desc : 基于 Transformers 的 LLM 预训练脚本 - -注:参考仓库:https://github.com/LlamaFamily/Llama-Chinese +预训练脚本 ''' import logging @@ -17,225 +7,106 @@ import math import os import sys from dataclasses import dataclass, field -from torchdata.datapipes.iter import IterDataPipe, IterableWrapper +from torchdata.datapipes.iter import IterableWrapper from itertools import chain import deepspeed from typing import Optional,List + import datasets import pandas as pd -import evaluate import torch from datasets import load_dataset -from datasets.combine import interleave_datasets import transformers -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers import ( - CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, AutoConfig, AutoModelForCausalLM, AutoTokenizer, - TrainerCallback, - TrainerState, - TrainerControl, HfArgumentParser, Trainer, TrainingArguments, default_data_collator, - is_torch_tpu_available, set_seed, ) import datetime from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry -from transformers.utils.versions import require_version -from datasets import interleave_datasets +import wandb logger = logging.getLogger(__name__) -MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) -MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) - +# 超参类 @dataclass class ModelArguments: """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + 关于模型的参数 """ model_name_or_path: Optional[str] = field( default=None, metadata={ "help": ( - "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." - ) - }, - ) - model_type: Optional[str] = field( - default=None, - metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, - ) - config_overrides: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override some existing default config settings when a model is trained from scratch. Example: " - "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + "后训练使用,为预训练模型参数地址" ) }, ) config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + default=None, metadata={"help": "预训练使用,Config 文件地址"} ) tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - cache_dir: Optional[str] = field( - default=None, - metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, - ) - use_fast_tokenizer: bool = field( - default=True, - metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - use_auth_token: bool = field( - default=False, - metadata={ - "help": ( - "Will use the token generated when running `huggingface-cli login` (necessary to use this script " - "with private models)." - ) - }, + default=None, metadata={"help": "预训练 Tokenizer 地址"} ) torch_dtype: Optional[str] = field( default=None, metadata={ "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." + "模型训练使用的数据类型,推荐 bfloat16" ), "choices": ["auto", "bfloat16", "float16", "float32"], }, ) - def __post_init__(self): - if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): - raise ValueError( - "--config_overrides can't be used in combination with --config_name or --model_name_or_path" - ) - @dataclass class DataTrainingArguments: """ - Arguments pertaining to what data we are going to input our model for training and eval. + 关于训练的参数 """ - dataset_name: Optional[str] = field( - default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} - ) - dataset_config_name: Optional[str] = field( - default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} - ) - train_files: Optional[List[str]] = field(default=None, metadata={"help": "The input training data file (a text file)."}) - validation_files: Optional[List[str]] = field( - default=None, - metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, - ) - max_train_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." - ) - }, - ) - max_eval_samples: Optional[int] = field( - default=None, - metadata={ - "help": ( - "For debugging purposes or quicker training, truncate the number of evaluation examples to this " - "value if set." - ) - }, - ) - streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) + train_files: Optional[List[str]] = field(default=None, metadata={"help": "训练数据路径"}) block_size: Optional[int] = field( default=None, metadata={ "help": ( - "Optional input sequence length after tokenization. " - "The training dataset will be truncated in block of this size for training. " - "Default to the model max input length for single sentence inputs (take into account special tokens)." + "设置的文本块长度" ) }, ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) - validation_split_percentage: Optional[int] = field( - default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, - ) preprocessing_num_workers: Optional[int] = field( default=None, - metadata={"help": "The number of processes to use for the preprocessing."}, - ) - keep_linebreaks: bool = field( - default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + metadata={"help": "预处理使用线程数."}, ) - def __post_init__(self): - if self.streaming: - require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") - - if self.dataset_name is None and self.train_files is None and self.validation_files is None: - raise ValueError("Need either a dataset name or a training/validation file.") - else: - if self.train_files is not None: - extension = self.train_files[0].split(".")[-1] - assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." - if self.validation_files is not None: - extension = self.validation_files[0].split(".")[-1] - assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. + # 加载脚本参数 parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args = parser.parse_args_into_dataclasses() - # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The - # information sent is the one passed as arguments along with your Python/PyTorch versions. - send_example_telemetry("run_clm", model_args, data_args) - - # Setup logging + # 初始化 WandB + wandb.init(project="pretrain", name="from_scrach") + + # 设置日志 logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) - if training_args.should_log: - # The default of training_args.log_level is passive, so we set log level at info here to have that default. - transformers.utils.logging.set_verbosity_info() - + # 将日志级别设置为 INFO + transformers.utils.logging.set_verbosity_info() log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) @@ -243,359 +114,144 @@ def main(): transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() - # Log on each process the small summary: + # 训练整体情况记录 logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") - # Detecting last checkpoint. + # 检查 checkpoint last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + if os.path.isdir(training_args.output_dir): last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." + f"输出路径 ({training_args.output_dir}) 非空 " ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + f"从 {last_checkpoint}恢复训练" ) - # Set seed before initializing model. + # 设置随机数种子. set_seed(training_args.seed) - # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). - # - # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called - # 'text' is found. You can easily tweak this behavior (see below). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - if True: - data_files = {} - dataset_args = {} - if data_args.train_files is not None: - - print(data_args.train_files) - data_files["train"] = data_args.train_files - print('训练文件总个数',len(data_args.train_files)) - if data_args.validation_files is not None: - data_files["validation"] = data_args.validation_files - extension = ( - data_files["train"][0].split(".")[-1] - if data_files["train"] is not None - else data_args.validation_files.split(".")[-1] - ) - if extension == "txt": - extension = "text" - dataset_args["keep_linebreaks"] = data_args.keep_linebreaks - - - raw_datasets = load_dataset( - extension, - data_files=data_files, - streaming=data_args.streaming, - cache_dir=os.path.join(training_args.output_dir,'dataset_cache'), - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - if data_args.streaming: - raw_datasets = raw_datasets.shuffle(seed=training_args.seed, buffer_size=1000000) - # If no validation data is there, validation_split_percentage will be used to divide the dataset. - if "validation" not in raw_datasets.keys(): - raw_datasets["validation"] = load_dataset( - extension, - data_files=data_files, - split=f"train[:{data_args.validation_split_percentage}%]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - raw_datasets["train"] = load_dataset( - extension, - data_files=data_files, - split=f"train[{data_args.validation_split_percentage}%:]", - cache_dir=model_args.cache_dir, - use_auth_token=True if model_args.use_auth_token else None, - **dataset_args, - ) - - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - - config_kwargs = { - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.config_name: - config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) - elif model_args.model_name_or_path: - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) - else: - config = CONFIG_MAPPING[model_args.model_type]() - logger.warning("You are instantiating a new config instance from scratch.") - if model_args.config_overrides is not None: - logger.info(f"Overriding config: {model_args.config_overrides}") - config.update_from_string(model_args.config_overrides) - logger.info(f"New config: {config}") - - print(training_args.local_rank,'start load tokenizer') - tokenizer_kwargs = { - "cache_dir": model_args.cache_dir, - "use_fast": model_args.use_fast_tokenizer, - "revision": model_args.model_revision, - "use_auth_token": True if model_args.use_auth_token else None, - } - if model_args.tokenizer_name: - tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) - elif model_args.model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) - else: - raise ValueError( - "You are instantiating a new tokenizer from scratch. This is not supported by this script." - "You can do it from another script, save it, and load it from here, using --tokenizer_name." - ) - print(training_args.local_rank,'end load tokenizer') - print(training_args.local_rank,'start load model') - if model_args.model_name_or_path: - torch_dtype = ( - model_args.torch_dtype - if model_args.torch_dtype in ["auto", None] - else getattr(torch, model_args.torch_dtype) - ) - model = AutoModelForCausalLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - trust_remote_code=True, - use_flash_attention_2=True, - use_auth_token=True if model_args.use_auth_token else None, - ) - else: + # 初始化模型 + if model_args.config_name is not None: + # from scrach + config = AutoConfig.from_pretrained(model_args.config_name) + logger.warning("你正在从零初始化一个模型") + logger.info(f"模型参数配置地址:{model_args.config_name}") + logger.info(f"模型参数:{config}") model = AutoModelForCausalLM.from_config(config,trust_remote_code=True) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") - print(training_args.local_rank,'end load model') - # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch - # on a small vocab and want a smaller embedding size, remove this test. - embedding_size = model.get_input_embeddings().weight.shape[0] - if len(tokenizer) > embedding_size: - model.resize_token_embeddings(len(tokenizer)) - # Preprocessing the datasets. - # First we tokenize all the texts. - if training_args.do_train: - if data_args.streaming: - dataset_head = raw_datasets["train"].take(3) - print(list(dataset_head)) - column_names = list(list(dataset_head)[0].keys()) - else: - column_names = list(raw_datasets["train"].features) + logger.info(f"预训练一个新模型 - Total size={n_params/2**20:.2f}M params") + elif model_args.model_name_or_path is not None: + logger.warning("你正在初始化一个预训练模型") + logger.info(f"模型参数地址:{model_args.model_name_or_path}") + model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,trust_remote_code=True) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) + logger.info(f"继承一个预训练模型 - Total size={n_params/2**20:.2f}M params") else: - if data_args.streaming: - dataset_head = raw_datasets["validation"].take(3) - column_names = list(list(dataset_head)[0].keys()) - else: - column_names = list(raw_datasets["validation"].features) - print(column_names) + logger.error("config_name 和 model_name_or_path 不能均为空") + raise ValueError("config_name 和 model_name_or_path 不能均为空") + + # 初始化 Tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name) + logger.info("完成 tokenzier 加载") + logger.info(f"tokenzier 配置地址:{model_args.tokenizer_name}") + + # 加载预训练数据 + ds = load_dataset('json', data_files=data_args.train_files) + logger.info("完成训练集加载") + logger.info(f"训练集地址:{data_args.train_files}") + logger.info(f'训练文件总数:{len(ds["train"])}') + # logger.info(f"训练集采样:{ds["train"][0]}") + + # 文本 tokenize + column_names = list(ds["train"].features) + logger.info('训练集特征:', column_names) text_column_name = "text" if "text" in column_names else column_names[0] - # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function - tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") - + # tokenize 函数 def tokenize_function(examples): - with CaptureLogger(tok_logger) as cl: - output = tokenizer( [ item for item in examples[text_column_name]]) + output = tokenizer([item for item in examples[text_column_name]]) return output + # 仅主进程进行数据预处理 with training_args.main_process_first(desc="dataset map tokenization"): - if not data_args.streaming: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on dataset", - ) - else: - tokenized_datasets = raw_datasets.map( - tokenize_function, - batched=True, - remove_columns=column_names, - batch_size = 60000, - ) + tokenized_datasets = ds.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on dataset" + ) + # 文本切块 if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( - "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" - " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" - " override this default with `--block_size xxx`." + "tokenizer 支持大于 1K 的上下文长度,默认设置为 1K" ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warning( - f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" - f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + f"设定的块长为 ({data_args.block_size}) ,大于模型的上下文长度" + f"将块长设置为模型上下文长度:{tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) - # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): - # Concatenate all texts. + # 将文本段拼接起来 concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - # concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + # 计算拼起来的整体长度 total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. + # 如果长度太长,进行分块 if total_length >= block_size: total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() - } - # print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) - logger.info("group texts input examples length%d after_group size%d"%(len(examples['input_ids']),len(result["input_ids"]))) + } result["labels"] = result["input_ids"].copy() return result - # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder - # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower - # to preprocess. - # - # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: - # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map - - with training_args.main_process_first(desc="grouping texts together"): - if not data_args.streaming: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=not data_args.overwrite_cache, - desc=f"Grouping texts in chunks of {block_size}", - batch_size = 40000, - ) - else: - lm_datasets = tokenized_datasets.map( - group_texts, - batched=True, - batch_size = 60000, - ) - print(training_args.local_rank,'start select train_dataset') - if training_args.do_train: - if "train" not in tokenized_datasets: - raise ValueError("--do_train requires a train dataset") + with training_args.main_process_first(desc="文本分块"): + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=True, + desc=f"文本分块到{block_size}", + batch_size = 40000, + ) + logger.info("完成数据预处理") train_dataset = lm_datasets["train"] - if data_args.max_train_samples is not None and data_args.streaming==False: - max_train_samples = min(len(train_dataset), data_args.max_train_samples) - train_dataset = train_dataset.select(range(max_train_samples)) - print(training_args.local_rank,'end select train_dataset') - - if training_args.do_eval: - if "validation" not in tokenized_datasets: - raise ValueError("--do_eval requires a validation dataset") - print(training_args.local_rank,'start select eval_dataset') - eval_dataset = lm_datasets["validation"] - if data_args.max_eval_samples is not None and data_args.streaming==False : - max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) - eval_dataset = eval_dataset.select(range(max_eval_samples)) - print(training_args.local_rank,'end select eval_dataset') - def preprocess_logits_for_metrics(logits, labels): - if isinstance(logits, tuple): - # Depending on the model and config, logits may contain extra tensors, - # like past_key_values, but logits always come first - logits = logits[0] - return logits.argmax(dim=-1) - print(training_args.local_rank,'start load metric') - metric = evaluate.load("accuracy.py") - print(training_args.local_rank,'end load metric') - - def compute_metrics(eval_preds): - preds, labels = eval_preds - # preds have the same shape as the labels, after the argmax(-1) has been calculated - # by preprocess_logits_for_metrics but we need to shift the labels - labels = labels[:, 1:].reshape(-1) - preds = preds[:, :-1].reshape(-1) - return metric.compute(predictions=preds, references=labels) - print(training_args.local_rank,'Initialize our Trainer') + logger.info("初始化 Trainer") trainer = Trainer( model=model, args=training_args, - train_dataset= IterableWrapper(train_dataset) if training_args.do_train else None, - eval_dataset= IterableWrapper(eval_dataset) if training_args.do_eval else None, + train_dataset= IterableWrapper(train_dataset), tokenizer=tokenizer, - # Data collator will default to DataCollatorWithPadding, so we change it. - data_collator=default_data_collator, - compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval and not is_torch_tpu_available()else None, - # callbacks=([SavePeftModelCallback] if isinstance(model, PeftModel) else None), + data_collator=default_data_collator ) - - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: + + # 从 checkpoint 加载 + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - print(training_args.local_rank,'start train') - train_result = trainer.train(resume_from_checkpoint=checkpoint) - trainer.save_model() # Saves the tokenizer too for easy upload - - metrics = train_result.metrics - - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - - # Evaluation - if training_args.do_eval: - logger.info("*** Evaluate ***") - - metrics = trainer.evaluate() - - max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) - metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - try: - perplexity = math.exp(metrics["eval_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - + logger.info("开始训练") + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() if __name__ == "__main__": main() \ No newline at end of file diff --git a/docs/chapter6/code/pretrain.sh b/docs/chapter6/code/pretrain.sh new file mode 100644 index 0000000..07c446f --- /dev/null +++ b/docs/chapter6/code/pretrain.sh @@ -0,0 +1,29 @@ +CUDA_VISIBLE_DEVICES=0,1 + +deepspeed pretrain.py \ + --config_name autodl-tmp/qwen-1.5b \ + --tokenizer_name autodl-tmp/qwen-1.5b \ + --train_files autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus_small.jsonl \ + --per_device_train_batch_size 16 \ + --gradient_accumulation_steps 4 \ + --do_train \ + --output_dir autodl-tmp/output/pretrain \ + --evaluation_strategy no \ + --learning_rate 1e-4 \ + --num_train_epochs 1 \ + --warmup_steps 200 \ + --logging_dir autodl-tmp/output/pretrain/logs \ + --logging_strategy steps \ + --logging_steps 5 \ + --save_strategy steps \ + --save_steps 100 \ + --preprocessing_num_workers 10 \ + --save_total_limit 1 \ + --seed 12 \ + --block_size 2048 \ + --bf16 \ + --gradient_checkpointing \ + --deepspeed ./ds_config_zero2.json \ + --report_to wandb + + # --resume_from_checkpoint ${output_model}/checkpoint-20400 \ \ No newline at end of file diff --git a/docs/chapter6/code/process_dataset.ipynb b/docs/chapter6/code/process_dataset.ipynb new file mode 100644 index 0000000..5a19acf --- /dev/null +++ b/docs/chapter6/code/process_dataset.ipynb @@ -0,0 +1,44 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8c3d4501-a268-418a-b5f6-59078094aab5", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "fw = open(\"autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus_small.jsonl\", \"w\")\n", + "i = 0\n", + "with open(\"autodl-tmp/dataset/pretrain_data/mobvoi_seq_monkey_general_open_corpus.jsonl\", \"r\") as f:\n", + " while i <= 1000000:\n", + " line = f.readline()\n", + " fw.write(line)\n", + " i += 1\n", + "fw.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}