Files
easy-rl/notebooks/NoisyDQN.ipynb
2022-12-04 20:54:36 +08:00

583 lines
120 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 定义算法\n",
"\n",
"NoisyDQN 是在 DQN 的基础上进行改进,主要就是通过在训练网络的时候加上一些噪声参数,可以用较小的额外计算成本,在强化学习算法上获得更优的结果。\n",
"配置和 DQN 基本一致,只是在模型定义的时候,在模型中加入了一些噪声参数。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1、 定义模型\n",
"这里使用了一个三层的 MLP,不同的是其中加入了一些噪声参数,即每个权值(weight)和偏置(bias)中都有额外的参数 mu 和 sigma,这里仅供参考。"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"class NoisyLinear(nn.Module):\n",
"    '''Linear layer whose weight and bias are perturbed by learnable noise.\n",
"\n",
"    The effective parameters are ``mu + sigma * epsilon``: ``mu`` and ``sigma`` are\n",
"    trainable, ``epsilon`` is a buffer resampled by ``reset_noise``. The noise is\n",
"    applied only in training mode; in eval mode the layer uses ``mu`` alone.\n",
"\n",
"    Args:\n",
"        input_dim: number of input features.\n",
"        output_dim: number of output features.\n",
"        std_init: numerator of the constant used to initialise the sigma parameters.\n",
"    '''\n",
"    def __init__(self, input_dim, output_dim, std_init=0.4):\n",
"        super(NoisyLinear, self).__init__()\n",
"\n",
"        self.input_dim = input_dim\n",
"        self.output_dim = output_dim\n",
"        self.std_init = std_init\n",
"\n",
"        # Storage only: the values are overwritten by reset_parameters() / reset_noise() below.\n",
"        self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim))\n",
"        self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim))\n",
"        # Buffers follow the module across devices and are saved in the state dict,\n",
"        # but they are not trainable parameters.\n",
"        self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim))\n",
"\n",
"        self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim))\n",
"        self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim))\n",
"        self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim))\n",
"\n",
"        self.reset_parameters()\n",
"        self.reset_noise()\n",
"\n",
"    def forward(self, x):\n",
"        '''Apply the (noisy in training mode, mean-only in eval mode) linear map to ``x``.'''\n",
"        if self.training:\n",
"            # The epsilon buffers are already tensors, so use them directly. Wrapping them\n",
"            # in torch.tensor(...) copied the data on every call and raised a UserWarning.\n",
"            # Because the buffers are now part of the autograd graph, reset_noise() must\n",
"            # not be called between this forward pass and the matching backward pass.\n",
"            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon\n",
"            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon\n",
"        else:\n",
"            weight = self.weight_mu\n",
"            bias = self.bias_mu\n",
"\n",
"        return F.linear(x, weight, bias)\n",
"\n",
"    def reset_parameters(self):\n",
"        '''Initialise mu uniformly in +-1/sqrt(input_dim) and sigma to a constant.'''\n",
"        mu_range = 1 / math.sqrt(self.weight_mu.size(1))\n",
"\n",
"        self.weight_mu.data.uniform_(-mu_range, mu_range)\n",
"        # weight_sigma.size(1) is input_dim\n",
"        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))\n",
"\n",
"        self.bias_mu.data.uniform_(-mu_range, mu_range)\n",
"        # bias_sigma.size(0) is output_dim\n",
"        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))\n",
"\n",
"    def reset_noise(self):\n",
"        '''Resample the epsilon buffers in place.\n",
"\n",
"        The weight noise is the outer product of an output-sized and an input-sized\n",
"        noise vector; the bias noise is a separately drawn output-sized vector.\n",
"        '''\n",
"        epsilon_in = self._scale_noise(self.input_dim)\n",
"        epsilon_out = self._scale_noise(self.output_dim)\n",
"\n",
"        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))\n",
"        self.bias_epsilon.copy_(self._scale_noise(self.output_dim))\n",
"\n",
"    def _scale_noise(self, size):\n",
"        '''Return ``size`` standard-normal samples transformed by sign(x) * sqrt(|x|).'''\n",
"        x = torch.randn(size)\n",
"        x = x.sign().mul(x.abs().sqrt())\n",
"        return x\n",
"\n",
"class NoisyMLP(nn.Module):\n",
"    '''Three-layer MLP: a plain linear input layer followed by two NoisyLinear layers.\n",
"\n",
"    Args:\n",
"        input_dim: size of the input vector.\n",
"        output_dim: size of the output vector.\n",
"        hidden_dim: width of both hidden layers.\n",
"    '''\n",
"    def __init__(self, input_dim,output_dim,hidden_dim=128):\n",
"        super().__init__()\n",
"        # Attribute names are kept as-is because they are the state-dict keys.\n",
"        self.fc1 = nn.Linear(input_dim, hidden_dim)\n",
"        self.noisy_fc2 = NoisyLinear(hidden_dim, hidden_dim)\n",
"        self.noisy_fc3 = NoisyLinear(hidden_dim, output_dim)\n",
"\n",
"    def forward(self, x):\n",
"        '''ReLU after the first two layers; the last layer's output is returned raw.'''\n",
"        hidden = F.relu(self.fc1(x))\n",
"        hidden = F.relu(self.noisy_fc2(hidden))\n",
"        return self.noisy_fc3(hidden)\n",
"\n",
"    def reset_noise(self):\n",
"        '''Resample the noise of both noisy layers (second layer first, then third).'''\n",
"        for noisy_layer in (self.noisy_fc2, self.noisy_fc3):\n",
"            noisy_layer.reset_noise()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2、定义经验回放\n",
"\n",
"这里的经验回放和 DQN 中保持一致,也是具有一定容量,只有存储到一定数量的 transition,网络才会更新。经验回放一般包含两个功能或方法:一个是 push,即将一个 transition 样本按顺序放到经验回放中,如果满了就把最开始放进去的样本挤掉;另外一个是 sample,就是随机采样出一个或者若干个(具体多少就是 batch_size 了)样本供 DQN 网络更新。"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from collections import deque\n",
"import random\n",
"class ReplayBuffer(object):\n",
"    '''Fixed-capacity experience replay; the oldest entries are dropped once it is full.'''\n",
"    def __init__(self, capacity: int) -> None:\n",
"        self.capacity = capacity\n",
"        # deque(maxlen=...) discards from the left when appending to a full buffer\n",
"        self.buffer = deque(maxlen=capacity)\n",
"\n",
"    def push(self,transitions):\n",
"        '''Append one transition to the buffer.'''\n",
"        self.buffer.append(transitions)\n",
"\n",
"    def sample(self, batch_size: int, sequential: bool = False):\n",
"        '''Draw a batch and return it transposed (one tuple per transition field).\n",
"\n",
"        Args:\n",
"            batch_size: requested number of transitions; capped at the number stored.\n",
"            sequential: if True take a contiguous run starting at a random index,\n",
"                otherwise sample without replacement.\n",
"        '''\n",
"        n_stored = len(self.buffer)\n",
"        batch_size = min(batch_size, n_stored)\n",
"        if not sequential:\n",
"            picked = random.sample(self.buffer, batch_size)\n",
"            return zip(*picked)\n",
"        start = random.randint(0, n_stored - batch_size)\n",
"        picked = [self.buffer[idx] for idx in range(start, start + batch_size)]\n",
"        return zip(*picked)\n",
"\n",
"    def clear(self):\n",
"        '''Remove every stored transition.'''\n",
"        self.buffer.clear()\n",
"\n",
"    def __len__(self):\n",
"        '''Number of transitions currently stored.'''\n",
"        return len(self.buffer)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3、模型算法定义\n",
"\n",
"这里根据前面的噪声 MLP 搭建智能体(agent),其中的动作采样和模型更新与 DQN 基本一致,这里不再赘述。"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.optim as optim\n",
"import math\n",
"import numpy as np\n",
"\n",
"class NoisyDQN:\n",
"    '''DQN agent with a policy network and a separate target network.\n",
"\n",
"    Args:\n",
"        model: Q-network; it must provide ``reset_noise()``.\n",
"        memory: replay buffer providing ``sample(batch_size)`` and ``__len__``.\n",
"        cfg: configuration object; the attributes read are n_actions, device, gamma,\n",
"            epsilon_start, epsilon_end, epsilon_decay, batch_size, target_update and lr.\n",
"    '''\n",
"    def __init__(self, model, memory, cfg) -> None:\n",
"        import copy  # local import: only needed to clone the network below\n",
"\n",
"        self.n_actions = cfg.n_actions\n",
"        self.device = torch.device(cfg.device)\n",
"        self.gamma = cfg.gamma\n",
"        # epsilon-greedy schedule\n",
"        self.sample_count = 0  # number of sample_action() calls, drives the epsilon decay\n",
"        self.epsilon = cfg.epsilon_start\n",
"        self.epsilon_start = cfg.epsilon_start\n",
"        self.epsilon_end = cfg.epsilon_end\n",
"        self.epsilon_decay = cfg.epsilon_decay\n",
"        self.batch_size = cfg.batch_size\n",
"        self.target_update = cfg.target_update\n",
"\n",
"        self.policy_net = model.to(self.device)\n",
"        # nn.Module.to() returns the module itself, so assigning model.to(...) to both\n",
"        # attributes made policy_net and target_net the very same object. A deep copy\n",
"        # gives the target network its own parameters and noise buffers, initialised\n",
"        # to the policy network's current values.\n",
"        self.target_net = copy.deepcopy(self.policy_net)\n",
"        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)\n",
"        self.memory = memory  # experience replay\n",
"        self.update_flag = False  # set on the first real update, used to log once\n",
"\n",
"    def sample_action(self, state):\n",
"        '''Pick an action with an epsilon-greedy policy and decay epsilon exponentially.'''\n",
"        self.sample_count += 1\n",
"        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(\n",
"            -1. * self.sample_count / self.epsilon_decay)\n",
"        if random.random() > self.epsilon:\n",
"            with torch.no_grad():\n",
"                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)\n",
"                q_values = self.policy_net(state)\n",
"                action = q_values.max(1)[1].item()  # index of the largest Q-value\n",
"        else:\n",
"            action = random.randrange(self.n_actions)\n",
"        return action\n",
"\n",
"    @torch.no_grad()\n",
"    def predict_action(self, state):\n",
"        '''Pick the greedy action for ``state`` (no epsilon exploration).\n",
"\n",
"        NOTE(review): the network mode is not changed here, so a network left in\n",
"        training mode still applies its noise - confirm this is intended for testing.\n",
"        '''\n",
"        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)\n",
"        q_value = self.policy_net(state)\n",
"        action = q_value.max(1)[1].item()\n",
"        return action\n",
"\n",
"    def update(self):\n",
"        '''Run one optimisation step on a sampled minibatch.\n",
"\n",
"        Does nothing until the replay buffer holds at least ``batch_size`` transitions.\n",
"        '''\n",
"        if len(self.memory) < self.batch_size:\n",
"            return\n",
"        if not self.update_flag:\n",
"            print(\"Begin to update!\")\n",
"            self.update_flag = True\n",
"        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(\n",
"            self.batch_size)\n",
"        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)\n",
"        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)\n",
"        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1)\n",
"        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)\n",
"        done_batch = torch.tensor(done_batch, device=self.device, dtype=torch.float).unsqueeze(1)\n",
"\n",
"        # Q(s, a) of the actions that were actually taken\n",
"        q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch)\n",
"        # max_a' Q_target(s', a'), detached so no gradient reaches the target network\n",
"        next_max_q_value_batch = self.target_net(next_state_batch).max(1)[0].detach().unsqueeze(1)\n",
"        # (1 - done) removes the bootstrap term for terminal transitions\n",
"        expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch * (1 - done_batch)\n",
"\n",
"        loss = nn.MSELoss()(q_value_batch, expected_q_value_batch)\n",
"        self.optimizer.zero_grad()\n",
"        loss.backward()\n",
"        # clamp each gradient element to [-1, 1]\n",
"        for param in self.policy_net.parameters():\n",
"            if param.grad is not None:\n",
"                param.grad.data.clamp_(-1, 1)\n",
"        self.optimizer.step()\n",
"\n",
"        if self.sample_count % self.target_update == 0:  # sync the target network\n",
"            self.target_net.load_state_dict(self.policy_net.state_dict())\n",
"\n",
"        # Resample the noise of both networks. This happens after backward(), so the\n",
"        # in-place buffer update cannot invalidate the graph used above.\n",
"        self.policy_net.reset_noise()\n",
"        self.target_net.reset_noise()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2、 定义训练"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"def train(cfg, env, agent):\n",
"    '''Train ``agent`` on ``env`` for ``cfg.train_eps`` episodes.\n",
"\n",
"    Args:\n",
"        cfg: configuration; reads train_eps, max_steps and target_update.\n",
"        env: environment whose reset() returns a state and whose step() returns a\n",
"            4-tuple (next_state, reward, done, info).\n",
"        agent: object providing sample_action, update, memory, policy_net, target_net\n",
"            and epsilon.\n",
"\n",
"    Returns:\n",
"        dict with 'rewards' (return of each episode) and 'steps' (length of each episode).\n",
"    '''\n",
"    print(\"开始训练!\")\n",
"    rewards = []  # return of every episode\n",
"    steps = []  # length of every episode\n",
"    for i_ep in range(cfg.train_eps):\n",
"        ep_reward = 0\n",
"        ep_step = 0\n",
"        state = env.reset()\n",
"        for _ in range(cfg.max_steps):\n",
"            ep_step += 1\n",
"            action = agent.sample_action(state)\n",
"            next_state, reward, done, _ = env.step(action)\n",
"            agent.memory.push((state, action, reward, next_state, done))\n",
"            state = next_state\n",
"            agent.update()\n",
"            ep_reward += reward\n",
"            if done:\n",
"                break\n",
"        if (i_ep + 1) % cfg.target_update == 0:  # episode-level target network sync\n",
"            agent.target_net.load_state_dict(agent.policy_net.state_dict())\n",
"        steps.append(ep_step)\n",
"        rewards.append(ep_reward)\n",
"        if (i_ep + 1) % 10 == 0:\n",
"            # separators and spelling fixed: the old message ran the reward, the label\n",
"            # and the epsilon value together as '...Epislon...'\n",
"            print(f\"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f},Epsilon:{agent.epsilon:.3f}\")\n",
"    print(\"完成训练!\")\n",
"    env.close()\n",
"    return {'rewards': rewards, 'steps': steps}\n",
"\n",
"def test(cfg, env, agent):\n",
"    '''Run ``cfg.test_eps`` episodes with agent.predict_action and report the returns.\n",
"\n",
"    Returns:\n",
"        dict with key 'rewards' holding the return of each episode.\n",
"    '''\n",
"    print(\"开始测试!\")\n",
"    episode_returns = []\n",
"    episode_lengths = []  # collected but not part of the returned dict\n",
"    for episode_idx in range(cfg.test_eps):\n",
"        total_reward, n_steps = 0, 0\n",
"        obs = env.reset()\n",
"        for _ in range(cfg.max_steps):\n",
"            n_steps += 1\n",
"            chosen = agent.predict_action(obs)\n",
"            obs, reward, done, _ = env.step(chosen)\n",
"            total_reward += reward\n",
"            if done:\n",
"                break\n",
"        episode_lengths.append(n_steps)\n",
"        episode_returns.append(total_reward)\n",
"        print(f\"回合:{episode_idx+1}/{cfg.test_eps},奖励:{total_reward:.2f}\")\n",
"    print(\"完成测试\")\n",
"    env.close()\n",
"    return {'rewards': episode_returns}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 定义环境"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import os\n",
"def all_seed(env,seed = 1):\n",
"    '''Seed the environment, numpy, random and torch, and adjust the cudnn flags.\n",
"\n",
"    Args:\n",
"        env: environment exposing ``seed(seed)``.\n",
"        seed: integer seed passed to every generator.\n",
"    '''\n",
"    env.seed(seed)\n",
"    # library-level generators\n",
"    random.seed(seed)\n",
"    np.random.seed(seed)\n",
"    torch.manual_seed(seed)  # CPU generator\n",
"    torch.cuda.manual_seed(seed)  # GPU generator\n",
"    os.environ['PYTHONHASHSEED'] = str(seed)\n",
"    # cudnn switches\n",
"    cudnn = torch.backends.cudnn\n",
"    cudnn.deterministic = True\n",
"    cudnn.benchmark = False\n",
"    cudnn.enabled = False\n",
"def env_agent_config(cfg):\n",
"    '''Create the gym environment and a NoisyDQN agent sized to match it.\n",
"\n",
"    Side effect: stores the number of actions on ``cfg.n_actions``.\n",
"\n",
"    Returns:\n",
"        (env, agent)\n",
"    '''\n",
"    env = gym.make(cfg.env_name)\n",
"    if cfg.seed != 0:  # a seed of 0 means: leave everything unseeded\n",
"        all_seed(env, seed=cfg.seed)\n",
"    n_states = env.observation_space.shape[0]\n",
"    n_actions = env.action_space.n\n",
"    print(f\"状态空间维度:{n_states},动作空间维度:{n_actions}\")\n",
"\n",
"    cfg.n_actions = n_actions  # the agent reads the action count from cfg\n",
"    q_network = NoisyMLP(n_states, n_actions, hidden_dim=cfg.hidden_dim)\n",
"    replay = ReplayBuffer(cfg.buffer_size)\n",
"    return env, NoisyDQN(q_network, replay, cfg)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4、设置参数"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"import argparse\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"class Config():\n",
"    '''Run settings and hyper-parameters for the NoisyDQN experiment.'''\n",
"    def __init__(self) -> None:\n",
"        # ---- general run settings ----\n",
"        self.env_name = \"CartPole-v1\"  # environment id\n",
"        self.new_step_api = True  # whether to use gym's new step API\n",
"        self.wrapper = None\n",
"        self.render = False\n",
"        self.algo_name = \"NoisyDQN\"  # algorithm name\n",
"        self.mode = \"train\"  # train or test\n",
"        self.seed = 0  # random seed\n",
"        self.device = \"cpu\"  # device to use\n",
"        self.train_eps = 100  # number of training episodes\n",
"        self.test_eps = 20  # number of test episodes\n",
"        self.eval_eps = 10  # number of evaluation episodes\n",
"        self.eval_per_episode = 5  # number of evaluations per episode\n",
"        self.max_steps = 200  # maximum number of steps per episode\n",
"        self.load_checkpoint = False\n",
"        self.load_path = \"tasks\"  # path to load a model from\n",
"        self.show_fig = False  # whether to show figures\n",
"        self.save_fig = True  # whether to save figures\n",
"\n",
"        # ---- epsilon-greedy schedule ----\n",
"        self.epsilon_start = 0.95  # initial epsilon\n",
"        self.epsilon_end = 0.01  # final epsilon\n",
"        self.epsilon_decay = 500  # decay rate\n",
"\n",
"        # ---- network and optimisation ----\n",
"        self.hidden_dim = 256\n",
"        self.gamma = 0.95\n",
"        self.lr = 0.0001\n",
"        self.buffer_size = 100000  # replay buffer capacity\n",
"        self.batch_size = 64  # batch size\n",
"        self.target_update = 4  # target network update frequency\n",
"        # (layer_dim, activation) of each linear layer, first to last\n",
"        layer_specs = [\n",
"            (['n_states', 256], 'relu'),\n",
"            ([256, 256], 'relu'),\n",
"            ([256, 'n_actions'], 'none'),\n",
"        ]\n",
"        self.value_layers = [\n",
"            {'layer_type': 'linear', 'layer_dim': dims, 'activation': act}\n",
"            for dims, act in layer_specs\n",
"        ]\n",
"\n",
"def smooth(data, weight=0.9):\n",
"    '''Exponentially smooth a sequence of numbers.\n",
"\n",
"    Each output is ``previous_output * weight + (1 - weight) * point``, with the first\n",
"    element of ``data`` used as the initial previous output.\n",
"\n",
"    Args:\n",
"        data: sequence of numbers; may be empty.\n",
"        weight: smoothing factor; larger values give a smoother curve.\n",
"\n",
"    Returns:\n",
"        list of smoothed values, the same length as ``data``.\n",
"    '''\n",
"    if len(data) == 0:  # nothing to smooth; data[0] below would raise IndexError\n",
"        return []\n",
"    last = data[0]\n",
"    smoothed = []\n",
"    for point in data:\n",
"        smoothed_val = last * weight + (1 - weight) * point\n",
"        smoothed.append(smoothed_val)\n",
"        last = smoothed_val\n",
"    return smoothed\n",
"\n",
"def plot_rewards(rewards,cfg, tag='train'):\n",
"    '''Plot the per-episode rewards together with their smoothed curve.\n",
"\n",
"    Args:\n",
"        rewards: sequence of episode returns.\n",
"        cfg: configuration; device, algo_name and env_name are used in the title.\n",
"        tag: prefix of the title, e.g. 'train' or 'test'.\n",
"    '''\n",
"    sns.set()\n",
"    plt.figure()  # new figure, so several curves can be plotted one after another\n",
"    plt.title(f\"{tag}ing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}\")\n",
"    plt.xlabel('episodes')  # label was misspelled 'epsiodes'\n",
"    plt.ylabel('rewards')\n",
"    plt.plot(rewards, label='rewards')\n",
"    plt.plot(smooth(rewards), label='smoothed')\n",
"    plt.legend()\n",
"    plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5、开始训练"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"状态空间维度4动作空间维度2\n",
"开始训练!\n",
"Begin to update!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/dingli/anaconda3/envs/joyrl/lib/python3.7/site-packages/ipykernel_launcher.py:26: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
"/home/dingli/anaconda3/envs/joyrl/lib/python3.7/site-packages/ipykernel_launcher.py:27: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"回合10/100奖励11.00Epislon0.711\n",
"回合20/100奖励18.00Epislon0.498\n",
"回合30/100奖励20.00Epislon0.359\n",
"回合40/100奖励20.00Epislon0.214\n",
"回合50/100奖励94.00Epislon0.049\n",
"回合60/100奖励200.00Epislon0.011\n",
"回合70/100奖励200.00Epislon0.010\n",
"回合80/100奖励200.00Epislon0.010\n",
"回合90/100奖励200.00Epislon0.010\n",
"回合100/100奖励200.00Epislon0.010\n",
"完成训练!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始测试!\n",
"回合1/20奖励200.00\n",
"回合2/20奖励200.00\n",
"回合3/20奖励200.00\n",
"回合4/20奖励200.00\n",
"回合5/20奖励200.00\n",
"回合6/20奖励200.00\n",
"回合7/20奖励200.00\n",
"回合8/20奖励200.00\n",
"回合9/20奖励200.00\n",
"回合10/20奖励200.00\n",
"回合11/20奖励200.00\n",
"回合12/20奖励200.00\n",
"回合13/20奖励200.00\n",
"回合14/20奖励200.00\n",
"回合15/20奖励200.00\n",
"回合16/20奖励200.00\n",
"回合17/20奖励200.00\n",
"回合18/20奖励200.00\n",
"回合19/20奖励200.00\n",
"回合20/20奖励200.00\n",
"完成测试\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 获取参数\n",
"cfg = Config() \n",
"# 训练\n",
"env, agent = env_agent_config(cfg)\n",
"res_dic = train(cfg, env, agent)\n",
" \n",
"plot_rewards(res_dic['rewards'], cfg, tag=\"train\") \n",
"# 测试\n",
"res_dic = test(cfg, env, agent)\n",
"plot_rewards(res_dic['rewards'], cfg, tag=\"test\") # 画出结果"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('joyrl')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "996e2c1bcfa8ebbd3aba48733c28d7658f0aec7cda7e9a0e5abbef50d3f90575"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}