diff --git a/codes/DQN/README.md b/codes/DQN-series/DQN/README.md similarity index 100% rename from codes/DQN/README.md rename to codes/DQN-series/DQN/README.md diff --git a/codes/DQN/agent.py b/codes/DQN-series/DQN/agent.py similarity index 73% rename from codes/DQN/agent.py rename to codes/DQN-series/DQN/agent.py index 66c11d7..27845d2 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN-series/DQN/agent.py @@ -12,9 +12,6 @@ LastEditTime: 2021-09-15 13:35:36 '''off-policy ''' - - - import torch import torch.nn as nn import torch.optim as optim @@ -24,9 +21,9 @@ import numpy as np from common.memory import ReplayBuffer from common.model import MLP class DQN: - def __init__(self, state_dim, action_dim, cfg): + def __init__(self, n_states, n_actions, cfg): - self.action_dim = action_dim # 总的动作个数 + self.n_actions = n_actions # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -35,15 +32,15 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. 
* frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 - self.memory = ReplayBuffer(cfg.memory_capacity) + self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放 def choose_action(self, state): - '''选择动作 + ''' 选择动作 ''' self.frame_idx += 1 if random.random() > self.epsilon(self.frame_idx): @@ -52,13 +49,7 @@ class DQN: q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.action_dim) - return action - def predict(self,state): - with torch.no_grad(): - state = torch.tensor([state], device=self.device, dtype=torch.float32) - q_values = self.policy_net(state) - action = q_values.max(1)[1].item() + action = random.randrange(self.n_actions) return action def update(self): if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 @@ -67,16 +58,11 @@ class DQN: state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) # 转为张量 - state_batch = torch.tensor( - state_batch, device=self.device, dtype=torch.float) - action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( - 1) - reward_batch = torch.tensor( - reward_batch, device=self.device, dtype=torch.float) - next_state_batch = torch.tensor( - next_state_batch, device=self.device, dtype=torch.float) - done_batch = torch.tensor(np.float32( - done_batch), device=self.device) + state_batch = torch.tensor(state_batch, 
device=self.device, dtype=torch.float) + action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) + next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32(done_batch), device=self.device) q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a) next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值 # 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward diff --git a/codes/DQN/assets/eval_rewards_curve.png b/codes/DQN-series/DQN/assets/eval_rewards_curve.png similarity index 100% rename from codes/DQN/assets/eval_rewards_curve.png rename to codes/DQN-series/DQN/assets/eval_rewards_curve.png diff --git a/codes/DQN/assets/image-20210507162813393.png b/codes/DQN-series/DQN/assets/image-20210507162813393.png similarity index 100% rename from codes/DQN/assets/image-20210507162813393.png rename to codes/DQN-series/DQN/assets/image-20210507162813393.png diff --git a/codes/DQN/assets/rewards_curve_train.png b/codes/DQN-series/DQN/assets/rewards_curve_train.png similarity index 100% rename from codes/DQN/assets/rewards_curve_train.png rename to codes/DQN-series/DQN/assets/rewards_curve_train.png diff --git a/codes/DQN/assets/train_rewards_curve.png b/codes/DQN-series/DQN/assets/train_rewards_curve.png similarity index 100% rename from codes/DQN/assets/train_rewards_curve.png rename to codes/DQN-series/DQN/assets/train_rewards_curve.png diff --git a/codes/DQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png b/codes/DQN-series/DQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png similarity index 100% rename from 
codes/DQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png rename to codes/DQN-series/DQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/models/dqn_checkpoint.pth b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/models/dqn_checkpoint.pth new file mode 100644 index 0000000..0686337 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/models/dqn_checkpoint.pth differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_ma_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_ma_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_ma_rewards.npy rename to codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_ma_rewards.npy diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards.npy rename to codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_rewards.npy diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_rewards_curve.png b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_rewards_curve.png new file mode 100644 index 0000000..a260f79 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/eval_rewards_curve.png differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_ma_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_ma_rewards.npy new file mode 100644 index 0000000..952fab3 Binary files /dev/null and 
b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards.npy similarity index 59% rename from codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy rename to codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards.npy index 5f8371f..43e4be6 100644 Binary files a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards.npy differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards_curve.png b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards_curve.png new file mode 100644 index 0000000..d4b6789 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211109-200235/results/train_rewards_curve.png differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth new file mode 100644 index 0000000..a0b6ef9 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy new file mode 100644 index 0000000..343fcc6 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy similarity index 65% rename from 
codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards.npy rename to codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy index 6498864..343fcc6 100644 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards.npy and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png new file mode 100644 index 0000000..a260f79 Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy new file mode 100644 index 0000000..1e0ab6c Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy new file mode 100644 index 0000000..88c137f Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy differ diff --git a/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png new file mode 100644 index 0000000..4c14b8d Binary files /dev/null and b/codes/DQN-series/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png differ diff --git a/codes/DQN-series/DQN/task0_train.ipynb b/codes/DQN-series/DQN/task0_train.ipynb new file mode 100644 index 0000000..b9a04fc --- /dev/null +++ b/codes/DQN-series/DQN/task0_train.ipynb @@ -0,0 
+1,379 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "curr_path = str(Path().absolute()) # 当前路径\n", + "parent_path = str(Path().absolute().parent) # 父路径\n", + "sys.path.append(parent_path) # 添加路径到系统路径\n", + "\n", + "import math,random\n", + "import gym\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.nn.functional as F\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from IPython.display import clear_output # 清空单元格输出区域" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 网络模型" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "class MLP(nn.Module):\n", + " def __init__(self, n_states,n_actions,hidden_dim=128):\n", + " \"\"\" 初始化q网络,为全连接网络\n", + " n_states: 输入的特征数即环境的状态数\n", + " n_actions: 输出的动作维度\n", + " \"\"\"\n", + " super(MLP, self).__init__()\n", + " self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层\n", + " self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层\n", + " self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层\n", + " \n", + " def forward(self, x):\n", + " # 各层对应的激活函数\n", + " x = F.relu(self.fc1(x)) \n", + " x = F.relu(self.fc2(x))\n", + " return self.fc3(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 经验回放" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "class ReplayBuffer:\n", + " def __init__(self, capacity):\n", + " self.capacity = capacity # 经验回放的容量\n", + " self.buffer = [] # 缓冲区\n", + " self.position = 0 \n", + " \n", + " def push(self, state, action, reward, next_state, done):\n", + " ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)\n", + " '''\n", + " if len(self.buffer) < self.capacity:\n", + " self.buffer.append(None)\n", + " 
self.buffer[self.position] = (state, action, reward, next_state, done)\n", + " self.position = (self.position + 1) % self.capacity \n", + " \n", + " def sample(self, batch_size):\n", + " batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移\n", + " state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等\n", + " return state, action, reward, next_state, done\n", + " \n", + " def __len__(self):\n", + " ''' 返回当前存储的量\n", + " '''\n", + " return len(self.buffer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DQN" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "class DQN:\n", + " def __init__(self, n_states, n_actions, cfg):\n", + "\n", + " self.n_actions = n_actions # 总的动作个数\n", + " self.device = cfg.device # 设备,cpu或gpu等\n", + " self.gamma = cfg.gamma # 奖励的折扣因子\n", + " # e-greedy策略相关参数\n", + " self.frame_idx = 0 # 用于epsilon的衰减计数\n", + " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", + " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", + " math.exp(-1. 
* frame_idx / cfg.epsilon_decay)\n", + " self.batch_size = cfg.batch_size\n", + " self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net\n", + " target_param.data.copy_(param.data)\n", + " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", + " self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放\n", + "\n", + " def choose_action(self, state):\n", + " ''' 选择动作\n", + " '''\n", + " self.frame_idx += 1\n", + " if random.random() > self.epsilon(self.frame_idx):\n", + " with torch.no_grad():\n", + " state = torch.tensor([state], device=self.device, dtype=torch.float32)\n", + " q_values = self.policy_net(state)\n", + " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", + " else:\n", + " action = random.randrange(self.n_actions)\n", + " return action\n", + " def update(self):\n", + " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略\n", + " return\n", + " # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)\n", + " state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(\n", + " self.batch_size)\n", + " # 转为张量\n", + " state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)\n", + " action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) \n", + " reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) \n", + " next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)\n", + " done_batch = torch.tensor(np.float32(done_batch), device=self.device)\n", + " q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)\n", + " next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值\n", + " # 
计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward\n", + " expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)\n", + " loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方误差损失\n", + " # 优化更新模型\n", + " self.optimizer.zero_grad() \n", + " loss.backward()\n", + " for param in self.policy_net.parameters(): # clip防止梯度爆炸\n", + " param.grad.data.clamp_(-1, 1)\n", + " self.optimizer.step()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DQN参数" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "class DQNConfig:\n", + " def __init__(self):\n", + " self.algo = \"DQN\" # 算法名称\n", + " self.env = 'CartPole-v0' # 环境名称\n", + " self.train_eps = 200 # 训练的回合数\n", + " self.eval_eps = 30 # 测试的回合数\n", + " self.gamma = 0.95 # 强化学习中的折扣因子\n", + " self.epsilon_start = 0.90 # e-greedy策略中初始epsilon\n", + " self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n", + " self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率\n", + " self.lr = 0.0001 # 学习率\n", + " self.memory_capacity = 100000 # 经验回放的容量\n", + " self.batch_size = 64 # mini-batch SGD中的批量大小\n", + " self.target_update = 4 # 目标网络的更新频率\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测GPU\n", + " self.hidden_dim = 256 # 网络隐藏层" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 创建环境" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def env_agent_config(cfg,seed=1):\n", + " ''' 创建环境和智能体\n", + " '''\n", + " env = gym.make(cfg.env) # 创建环境\n", + " env.seed(seed) # 设置随机种子\n", + " n_states = env.observation_space.shape[0] # 状态数\n", + " n_actions = env.action_space.n # 动作数\n", + " agent = DQN(n_states,n_actions,cfg) # 创建智能体\n", + " return env,agent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练" + ] + }, + { + "cell_type": "code", + 
"execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def train(cfg, env, agent):\n", + " ''' 训练\n", + " '''\n", + " print('开始训练!')\n", + " print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n", + " rewards = [] # 记录所有回合的奖励\n", + " ma_rewards = [] # 记录所有回合的滑动平均奖励\n", + " for i_ep in range(cfg.train_eps):\n", + " ep_reward = 0 # 记录一回合内的奖励\n", + " state = env.reset() # 重置环境,返回初始状态\n", + " while True:\n", + " action = agent.choose_action(state) # 选择动作\n", + " next_state, reward, done, _ = env.step(action) # 更新环境,返回transition\n", + " agent.memory.push(state, action, reward, next_state, done) # 保存transition\n", + " state = next_state # 更新下一个状态\n", + " agent.update() # 更新智能体\n", + " ep_reward += reward # 累加奖励\n", + " if done:\n", + " break\n", + " if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新\n", + " agent.target_net.load_state_dict(agent.policy_net.state_dict())\n", + " if (i_ep+1)%10 == 0: \n", + " print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))\n", + " rewards.append(ep_reward)\n", + " if ma_rewards:\n", + " ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)\n", + " else:\n", + " ma_rewards.append(ep_reward)\n", + " print('完成训练!')\n", + " return rewards, ma_rewards\n", + "\n", + "def plot_rewards(rewards,ma_rewards,plot_cfg):\n", + " # clear_output(True) # 清空单元格输出区域,因为多次打印,每次需要清楚前面打印的图片\n", + " sns.set() \n", + " plt.figure() # 创建一个图形实例,方便同时多画几个图\n", + " plt.title(\"learning curve on {} of {} for {}\".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env))\n", + " plt.xlabel('epsiodes')\n", + " plt.plot(rewards,label='rewards')\n", + " plt.plot(ma_rewards,label='ma rewards')\n", + " plt.legend()\n", + " plt.show()\n", + "\n", + "class PlotConfig:\n", + " def __init__(self) -> None:\n", + " self.algo = \"DQN\" # 算法名称\n", + " self.env = 'CartPole-v0' # 环境名称\n", + " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测GPU\n", + "\n", + "cfg = DQNConfig()\n", + "plot_cfg = 
PlotConfig()\n", + "env,agent = env_agent_config(cfg,seed=1)\n", + "rewards, ma_rewards = train(cfg, env, agent)\n", + "plot_rewards(rewards, ma_rewards, plot_cfg) # 画出结果" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def eval(cfg,env,agent):\n", + " print('开始测试!')\n", + " print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n", + " # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0\n", + " cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon\n", + " cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n", + " rewards = [] # 记录所有回合的奖励\n", + " ma_rewards = [] # 记录所有回合的滑动平均奖励\n", + " for i_ep in range(cfg.eval_eps):\n", + " ep_reward = 0 # 记录一回合内的奖励\n", + " state = env.reset() # 重置环境,返回初始状态\n", + " while True:\n", + " action = agent.choose_action(state) # 选择动作\n", + " next_state, reward, done, _ = env.step(action) # 更新环境,返回transition\n", + " state = next_state # 更新下一个状态\n", + " ep_reward += reward # 累加奖励\n", + " if done:\n", + " break\n", + " rewards.append(ep_reward)\n", + " if ma_rewards:\n", + " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", + " else:\n", + " ma_rewards.append(ep_reward)\n", + " if (i_ep+1)%3 == 0: \n", + " print(f\"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}\")\n", + " print('完成测试!')\n", + " return rewards,ma_rewards\n", + "\n", + "rewards,ma_rewards = eval(cfg,env,agent)\n", + "plot_rewards(rewards,ma_rewards, plot_cfg) # 画出结果\n" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "fe38df673a99c62a9fea33a7aceda74c9b65b12ee9d076c5851d98b692a4989a" + }, + "kernelspec": { + "display_name": "Python 3.7.10 64-bit ('py37': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "metadata": { + "interpreter": { + "hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232" + } + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codes/DQN/task0_train.py 
b/codes/DQN-series/DQN/task0_train.py similarity index 53% rename from codes/DQN/task0_train.py rename to codes/DQN-series/DQN/task0_train.py index 70b5b69..6827bb0 100644 --- a/codes/DQN/task0_train.py +++ b/codes/DQN-series/DQN/task0_train.py @@ -19,19 +19,14 @@ import torch import datetime from common.utils import save_results, make_dir -from common.plot import plot_rewards,plot_rewards_cn +from common.plot import plot_rewards from DQN.agent import DQN curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 - class DQNConfig: def __init__(self): self.algo = "DQN" # 算法名称 self.env = 'CartPole-v0' # 环境名称 - self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # 保存模型的路径 self.train_eps = 200 # 训练的回合数 self.eval_eps = 30 # 测试的回合数 self.gamma = 0.95 # 强化学习中的折扣因子 @@ -42,42 +37,53 @@ class DQNConfig: self.memory_capacity = 100000 # 经验回放的容量 self.batch_size = 64 # mini-batch SGD中的批量大小 self.target_update = 4 # 目标网络的更新频率 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.hidden_dim = 256 # hidden size of net - + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env = 'CartPole-v0' # 环境名称 + self.result_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DQN(n_states,n_actions,cfg) + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env) # 创建环境 + env.seed(seed) # 
设置随机种子 + n_states = env.observation_space.shape[0] # 状态数 + n_actions = env.action_space.n # 动作数 + agent = DQN(n_states,n_actions,cfg) # 创建智能体 return env,agent def train(cfg, env, agent): + ''' 训练 + ''' print('开始训练!') print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录奖励 - ma_rewards = [] # 记录滑动平均奖励 + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 while True: - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + agent.memory.push(state, action, reward, next_state, done) # 保存transition + state = next_state # 更新下一个状态 + agent.update() # 更新智能体 + ep_reward += reward # 累加奖励 if done: break - if (i_ep+1) % cfg.target_update == 0: + if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 agent.target_net.load_state_dict(agent.policy_net.state_dict()) - if (i_ep+1)%10 == 0: + if (i_ep+1)%10 == 0: print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) - # save ma_rewards if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: @@ -88,16 +94,19 @@ def train(cfg, env, agent): def eval(cfg,env,agent): print('开始测试!') print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] - ma_rewards = [] # moving average rewards + # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.eval_eps): - ep_reward = 0 # reward per episode - state = env.reset() + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 while True: 
- action = agent.predict(state) - next_state, reward, done, _ = env.step(action) - state = next_state - ep_reward += reward + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + state = next_state # 更新下一个状态 + ep_reward += reward # 累加奖励 if done: break rewards.append(ep_reward) @@ -111,17 +120,17 @@ def eval(cfg,env,agent): if __name__ == "__main__": cfg = DQNConfig() + plot_cfg = PlotConfig() # 训练 env,agent = env_agent_config(cfg,seed=1) rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards_cn(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) + make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 # 测试 env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) + agent.load(path=plot_cfg.model_path) # 导入模型 rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards,ma_rewards, plot_cfg, tag="eval") # 画出结果 diff --git a/codes/DoubleDQN/README.md b/codes/DQN-series/DoubleDQN/README.md similarity index 100% rename from codes/DoubleDQN/README.md rename to codes/DQN-series/DoubleDQN/README.md diff --git a/codes/DoubleDQN/agent.py b/codes/DQN-series/DoubleDQN/agent.py similarity index 100% rename from codes/DoubleDQN/agent.py rename to codes/DQN-series/DoubleDQN/agent.py diff --git a/codes/DoubleDQN/assets/20201222145725907.png 
b/codes/DQN-series/DoubleDQN/assets/20201222145725907.png similarity index 100% rename from codes/DoubleDQN/assets/20201222145725907.png rename to codes/DQN-series/DoubleDQN/assets/20201222145725907.png diff --git a/codes/DoubleDQN/assets/20201222150225327.png b/codes/DQN-series/DoubleDQN/assets/20201222150225327.png similarity index 100% rename from codes/DoubleDQN/assets/20201222150225327.png rename to codes/DQN-series/DoubleDQN/assets/20201222150225327.png diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png b/codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png similarity index 100% rename from codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png rename to codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png b/codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png similarity index 100% rename from codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png rename to codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png diff --git 
a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png b/codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png similarity index 100% rename from codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png rename to codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png diff --git a/codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png b/codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png similarity index 100% rename from codes/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png rename to codes/DQN-series/DoubleDQN/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth similarity index 100% rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy similarity index 100% rename from 
codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy similarity index 100% rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png similarity index 100% rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy similarity index 100% rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy similarity index 100% rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png b/codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png similarity index 100% 
rename from codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png rename to codes/DQN-series/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png diff --git a/codes/DoubleDQN/task0_train.ipynb b/codes/DQN-series/DoubleDQN/task0_train.ipynb similarity index 100% rename from codes/DoubleDQN/task0_train.ipynb rename to codes/DQN-series/DoubleDQN/task0_train.ipynb diff --git a/codes/DoubleDQN/task0_train.py b/codes/DQN-series/DoubleDQN/task0_train.py similarity index 100% rename from codes/DoubleDQN/task0_train.py rename to codes/DQN-series/DoubleDQN/task0_train.py diff --git a/codes/DQN-series/DuelingDQN/assets/task0_train_20211112021954.png b/codes/DQN-series/DuelingDQN/assets/task0_train_20211112021954.png new file mode 100644 index 0000000..2529311 Binary files /dev/null and b/codes/DQN-series/DuelingDQN/assets/task0_train_20211112021954.png differ diff --git a/codes/DQN-series/DuelingDQN/task0_train.ipynb b/codes/DQN-series/DuelingDQN/task0_train.ipynb new file mode 100644 index 0000000..c2cd1c3 --- /dev/null +++ b/codes/DQN-series/DuelingDQN/task0_train.ipynb @@ -0,0 +1,418 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import math, random\n", + "import gym\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torch.autograd as autograd \n", + "import torch.nn.functional as F\n", + "from IPython.display import clear_output # 清空单元格输出区域\n", + "import matplotlib.pyplot as plt\n", + "# %matplotlib inline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "USE_CUDA = torch.cuda.is_available()\n", + "Variable = lambda *args, **kwargs: autograd.Variable(*args, **kwargs).cuda() if USE_CUDA else autograd.Variable(*args, **kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + 
"outputs": [], + "source": [ + "from collections import deque\n", + "\n", + "class ReplayBuffer(object):\n", + " def __init__(self, capacity):\n", + " self.buffer = deque(maxlen=capacity)\n", + " \n", + " def push(self, state, action, reward, next_state, done):\n", + " state = np.expand_dims(state, 0)\n", + " next_state = np.expand_dims(next_state, 0)\n", + " \n", + " self.buffer.append((state, action, reward, next_state, done))\n", + " \n", + " def sample(self, batch_size):\n", + " state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))\n", + " return np.concatenate(state), action, reward, np.concatenate(next_state), done\n", + " \n", + " def __len__(self):\n", + " return len(self.buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "env_name = \"CartPole-v0\"\n", + "env = gym.make(env_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "epsilon_start = 1.0\n", + "epsilon_final = 0.01\n", + "epsilon_decay = 500\n", + "\n", + "epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot([epsilon_by_frame(i) for i in range(10000)])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dueling DQN 网络\n", + "\n", + "DQN等算法中使用的是一个简单的三层神经网络:一个输入层,一个隐藏层和一个输出层。如下左图:\n", + "\n", + "\"image-20211112022028670\"\n", + "\n", + "而在Dueling DQN中,我们在后面加了两个子网络结构,分别对应上面上到价格函数网络部分和优势函数网络部分。对应上面右图所示。最终Q网络的输出由价格函数网络的输出和优势函数网络的输出线性组合得到。\n", + "\n", + "我们可以直接使用上一节的价值函数的组合公式得到我们的动作价值,但是这个式子无法辨识最终输出里面$V(S, w, \\alpha)$和$A(S, A, w, \\beta)$各自的作用,为了可以体现这种可辨识性(identifiability),实际使用的组合公式如下:\n", + "\n", + "$$\n", + "Q(S, A, w, \\alpha, \\beta)=V(S, w, \\alpha)+\\left(A(S, A, w, \\beta)-\\frac{1}{\\mathcal{A}} \\sum_{a^{\\prime} \\in \\mathcal{A}} A\\left(S, a^{\\prime}, w, \\beta\\right)\\right)\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "class DuelingNet(nn.Module):\n", + " def __init__(self, n_states, n_actions,hidden_size=128):\n", + " super(DuelingNet, self).__init__()\n", + " \n", + " # 隐藏层\n", + " self.hidden = nn.Sequential(\n", + " nn.Linear(n_states, hidden_size),\n", + " nn.ReLU()\n", + " )\n", + " \n", + " # 优势函数\n", + " self.advantage = nn.Sequential(\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_size, n_actions)\n", + " )\n", + " \n", + " # 价值函数\n", + " self.value = nn.Sequential(\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_size, 1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " x = self.hidden(x)\n", + " advantage = self.advantage(x)\n", + " value = self.value(x)\n", + " return value + advantage - advantage.mean()\n", + " \n", + " def act(self, state, epsilon):\n", + " if random.random() > epsilon:\n", + " with torch.no_grad():\n", + " state = Variable(torch.FloatTensor(state).unsqueeze(0))\n", + " q_value = self.forward(state)\n", + " 
action = q_value.max(1)[1].item()\n", + " else:\n", + " action = random.randrange(env.action_space.n)\n", + " return action" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "unexpected EOF while parsing (, line 1)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m class DuelingDQN:\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unexpected EOF while parsing\n" + ] + } + ], + "source": [ + "class DuelingDQN:\n", + " def __init__(self,n_states,n_actions,cfg) -> None:\n", + " self.batch_size = cfg.batch_size\n", + " self.device = cfg.device\n", + " self.loss_history = [] # 记录loss的变化\n", + " self.frame_idx = 0 # 用于epsilon的衰减计数\n", + " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", + " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", + " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", + " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", + " target_param.data.copy_(param.data)\n", + " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", + " self.memory = ReplayBuffer(cfg.memory_capacity) \n", + " def choose_action(self,state):\n", + " self.frame_idx += 1\n", + " if random.random() > self.epsilon(self.frame_idx):\n", + " with torch.no_grad():\n", + " state = torch.tensor([state], device=self.device, dtype=torch.float32)\n", + " q_values = self.policy_net(state)\n", + " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", + " else:\n", + " action = random.randrange(self.action_dim)\n", + " return action\n", + " def update(self):\n", + " if len(self.memory) < 
self.batch_size: # 当memory中不满足一个批量时,不更新策略\n", + " return\n", + " state, action, reward, next_state, done = self.memory.sample(batch_size)\n", + " state = torch.tensor(state, device=self.device, dtype=torch.float)\n", + " action = torch.tensor(action, device=self.device).unsqueeze(1) \n", + " reward = torch.tensor(reward, device=self.device, dtype=torch.float) \n", + " next_state = torch.tensor(next_state, device=self.device, dtype=torch.float)\n", + " done = torch.tensor(np.float32(done), device=self.device)\n", + " q_values = self.policy_net(state)\n", + " next_q_values = self.target_net(next_state)\n", + "\n", + " q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)\n", + " next_q_value = next_q_values.max(1)[0]\n", + " expected_q_value = reward + gamma * next_q_value * (1 - done)\n", + " \n", + " loss = (q_value - expected_q_value.detach()).pow(2).mean()\n", + " self.loss_history.append(loss)\n", + " self.optimizer.zero_grad()\n", + " loss.backward()\n", + " self.optimizer.step()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "current_model = DuelingNet(env.observation_space.shape[0], env.action_space.n)\n", + "target_model = DuelingNet(env.observation_space.shape[0], env.action_space.n)\n", + "\n", + "if USE_CUDA:\n", + " current_model = current_model.cuda()\n", + " target_model = target_model.cuda()\n", + " \n", + "optimizer = optim.Adam(current_model.parameters())\n", + "\n", + "replay_buffer = ReplayBuffer(1000)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def update_target(current_model, target_model):\n", + " target_model.load_state_dict(current_model.state_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "update_target(current_model, target_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + 
"outputs": [], + "source": [ + "def compute_td_loss(batch_size):\n", + " state, action, reward, next_state, done = replay_buffer.sample(batch_size)\n", + "\n", + " state = Variable(torch.FloatTensor(np.float32(state)))\n", + " next_state = Variable(torch.FloatTensor(np.float32(next_state)))\n", + " action = Variable(torch.LongTensor(action))\n", + " reward = Variable(torch.FloatTensor(reward))\n", + " done = Variable(torch.FloatTensor(done))\n", + "\n", + " q_values = current_model(state)\n", + " next_q_values = target_model(next_state)\n", + "\n", + " q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)\n", + " next_q_value = next_q_values.max(1)[0]\n", + " expected_q_value = reward + gamma * next_q_value * (1 - done)\n", + " \n", + " loss = (q_value - expected_q_value.detach()).pow(2).mean()\n", + " \n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def plot(frame_idx, rewards, losses):\n", + " clear_output(True) # 清空单元格输出区域,因为多次打印,每次需要清楚前面打印的图片\n", + " plt.figure(figsize=(20,5))\n", + " plt.subplot(131)\n", + " plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-10:])))\n", + " plt.plot(rewards)\n", + " plt.subplot(132)\n", + " plt.title('loss')\n", + " plt.plot(losses)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "num_frames = 10000\n", + "batch_size = 32\n", + "gamma = 0.99\n", + "\n", + "losses = []\n", + "all_rewards = []\n", + "ep_reward = 0\n", + "\n", + "state = env.reset()\n", + "for frame_idx in range(1, num_frames + 1):\n", + " epsilon = epsilon_by_frame(frame_idx)\n", + " action = current_model.act(state, epsilon)\n", + " next_state, reward, done, _ = env.step(action)\n", + " replay_buffer.push(state, action, reward, next_state, done)\n", + " \n", + " state = next_state\n", + " ep_reward += reward\n", + " \n", + " if done:\n", + " state = env.reset()\n", + " all_rewards.append(ep_reward)\n", + " ep_reward = 0\n", + " \n", + " if len(replay_buffer) > batch_size:\n", + " loss = compute_td_loss(batch_size)\n", + " losses.append(loss.item())\n", + " \n", + " if frame_idx % 200 == 0:\n", + " plot(frame_idx, all_rewards, losses)\n", + " \n", + " if frame_idx % 100 == 0:\n", + " update_target(current_model, target_model)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 参考\n", + "\n", + "[强化学习(十二) Dueling DQN](https://www.cnblogs.com/pinard/p/9923859.html)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "fe38df673a99c62a9fea33a7aceda74c9b65b12ee9d076c5851d98b692a4989a" + }, + "kernelspec": { + "display_name": "Python 3.7.10 64-bit ('py37': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codes/HierarchicalDQN/README.md b/codes/DQN-series/HierarchicalDQN/README.md similarity index 100% rename from codes/HierarchicalDQN/README.md rename to 
codes/DQN-series/HierarchicalDQN/README.md diff --git a/codes/HierarchicalDQN/agent.py b/codes/DQN-series/HierarchicalDQN/agent.py similarity index 100% rename from codes/HierarchicalDQN/agent.py rename to codes/DQN-series/HierarchicalDQN/agent.py diff --git a/codes/HierarchicalDQN/assets/image-20210331153115575.png b/codes/DQN-series/HierarchicalDQN/assets/image-20210331153115575.png similarity index 100% rename from codes/HierarchicalDQN/assets/image-20210331153115575.png rename to codes/DQN-series/HierarchicalDQN/assets/image-20210331153115575.png diff --git a/codes/HierarchicalDQN/assets/image-20210331153542314.png b/codes/DQN-series/HierarchicalDQN/assets/image-20210331153542314.png similarity index 100% rename from codes/HierarchicalDQN/assets/image-20210331153542314.png rename to codes/DQN-series/HierarchicalDQN/assets/image-20210331153542314.png diff --git a/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy b/codes/DQN-series/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy similarity index 100% rename from codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy rename to codes/DQN-series/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png b/codes/DQN-series/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png similarity index 100% rename from codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png rename to codes/DQN-series/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy b/codes/DQN-series/HierarchicalDQN/results/20210331-134559/rewards_train.npy similarity index 100% rename from codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy rename to codes/DQN-series/HierarchicalDQN/results/20210331-134559/rewards_train.npy diff --git a/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png 
b/codes/DQN-series/HierarchicalDQN/results/20210331-145852/losses_curve.png similarity index 100% rename from codes/HierarchicalDQN/results/20210331-145852/losses_curve.png rename to codes/DQN-series/HierarchicalDQN/results/20210331-145852/losses_curve.png diff --git a/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy b/codes/DQN-series/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy similarity index 100% rename from codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy rename to codes/DQN-series/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png b/codes/DQN-series/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png similarity index 100% rename from codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png rename to codes/DQN-series/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy b/codes/DQN-series/HierarchicalDQN/results/20210331-145852/rewards_train.npy similarity index 100% rename from codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy rename to codes/DQN-series/HierarchicalDQN/results/20210331-145852/rewards_train.npy diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth b/codes/DQN-series/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth similarity index 100% rename from codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth rename to codes/DQN-series/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth b/codes/DQN-series/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth similarity index 100% rename from codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth rename to 
codes/DQN-series/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth b/codes/DQN-series/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth similarity index 100% rename from codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth rename to codes/DQN-series/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth b/codes/DQN-series/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth similarity index 100% rename from codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth rename to codes/DQN-series/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth diff --git a/codes/HierarchicalDQN/task0_train.ipynb b/codes/DQN-series/HierarchicalDQN/task0_train.ipynb similarity index 100% rename from codes/HierarchicalDQN/task0_train.ipynb rename to codes/DQN-series/HierarchicalDQN/task0_train.ipynb diff --git a/codes/HierarchicalDQN/task0_train.py b/codes/DQN-series/HierarchicalDQN/task0_train.py similarity index 100% rename from codes/HierarchicalDQN/task0_train.py rename to codes/DQN-series/HierarchicalDQN/task0_train.py diff --git a/codes/DQN-series/NoisyDQN/task0_train.ipynb b/codes/DQN-series/NoisyDQN/task0_train.ipynb new file mode 100644 index 0000000..ecd0092 --- /dev/null +++ b/codes/DQN-series/NoisyDQN/task0_train.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "curr_path = str(Path().absolute()) # 当前路径\n", + "parent_path = str(Path().absolute().parent) # 父路径\n", + "sys.path.append(parent_path) # 添加路径到系统路径" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/codes/DQN-series/README.md b/codes/DQN-series/README.md new file mode 100644 index 0000000..38de319 --- /dev/null +++ b/codes/DQN-series/README.md @@ -0,0 +1,3 @@ + +本目录下汇总了基础的DQN及其变种或升级,如下 + diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20210912-013122/models/dqn_checkpoint.pth deleted file mode 100644 index 7cb626f..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_ma_rewards.npy deleted file mode 100644 index 516efb9..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards_curve.png deleted file mode 100644 index d304a19..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/eval_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_ma_rewards.npy deleted file mode 100644 index 1d8f61b..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards.npy deleted file mode 100644 index 32dbedf..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards_curve.png deleted file mode 100644 
index 147540c..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210912-013122/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth deleted file mode 100644 index ecfc662..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png deleted file mode 100644 index 1f55598..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy deleted file mode 100644 index 65ead2d..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png deleted file mode 100644 index 617f693..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DQN/task0_train.ipynb b/codes/DQN/task0_train.ipynb deleted file mode 100644 index 94ebd60..0000000 --- a/codes/DQN/task0_train.ipynb +++ /dev/null @@ -1,270 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": 
"python3710jvsc74a57bd0366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232", - "display_name": "Python 3.7.10 64-bit ('py37': conda)" - }, - "metadata": { - "interpreter": { - "hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add current terminal path to sys.path" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import torch\n", - "import datetime\n", - "\n", - "from common.utils import save_results, make_dir\n", - "from common.plot import plot_rewards\n", - "from DQN.agent import DQN\n", - "\n", - "curr_time = datetime.datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\") # obtain current time" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "class DQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"DQN\" # name of algo\n", - " self.env = 'CartPole-v0'\n", - " self.result_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/results/' # path to save results\n", - " self.model_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/models/' # path to save results\n", - " self.train_eps = 300 # max trainng episodes\n", - " self.eval_eps = 50 # number of episodes for evaluating\n", - " self.gamma = 0.95\n", - " self.epsilon_start = 0.90 # start epsilon of e-greedy policy\n", - " self.epsilon_end = 0.01\n", - " self.epsilon_decay = 500\n", - " self.lr = 0.0001 # learning rate\n", - " self.memory_capacity = 100000 # capacity of Replay Memory\n", - " self.batch_size = 64\n", - " self.target_update = 2 # update 
frequency of target net\n", - " self.device = torch.device(\n", - " \"cuda\" if torch.cuda.is_available() else \"cpu\") # check gpu\n", - " self.hidden_dim = 256 # hidden size of net" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def env_agent_config(cfg,seed=1):\n", - " env = gym.make(cfg.env) \n", - " env.seed(seed)\n", - " state_dim = env.observation_space.shape[0]\n", - " action_dim = env.action_space.n\n", - " agent = DQN(state_dim,action_dim,cfg)\n", - " return env,agent" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def train(cfg, env, agent):\n", - " print('Start to train !')\n", - " print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n", - " rewards = []\n", - " ma_rewards = [] # moveing average reward\n", - " for i_ep in range(cfg.train_eps):\n", - " state = env.reset()\n", - " done = False\n", - " ep_reward = 0\n", - " while True:\n", - " action = agent.choose_action(state)\n", - " next_state, reward, done, _ = env.step(action)\n", - " ep_reward += reward\n", - " agent.memory.push(state, action, reward, next_state, done)\n", - " state = next_state\n", - " agent.update()\n", - " if done:\n", - " break\n", - " if i_ep % cfg.target_update == 0:\n", - " agent.target_net.load_state_dict(agent.policy_net.state_dict())\n", - " if (i_ep+1)%10 == 0:\n", - " print('Episode:{}/{}, Reward:{}'.format(i_ep+1, cfg.train_eps, ep_reward))\n", - " rewards.append(ep_reward)\n", - " # save ma rewards\n", - " if ma_rewards:\n", - " ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print('Complete training!')\n", - " return rewards, ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def eval(cfg,env,agent):\n", - " rewards = [] \n", - " ma_rewards = [] # moving average rewards\n", - " for i_ep in 
range(cfg.eval_eps):\n", - " ep_reward = 0 # reward per episode\n", - " state = env.reset() \n", - " while True:\n", - " action = agent.predict(state) \n", - " next_state, reward, done, _ = env.step(action) \n", - " state = next_state \n", - " ep_reward += reward\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " if (i_ep+1)%10==0:\n", - " print(f\"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}\")\n", - " return rewards,ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "tags": [] - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Start to train !\n", - "Env:CartPole-v0, Algorithm:DQN, Device:cuda\n", - "Episode:10/300, Reward:13.0\n", - "Episode:20/300, Reward:14.0\n", - "Episode:30/300, Reward:14.0\n", - "Episode:40/300, Reward:12.0\n", - "Episode:50/300, Reward:125.0\n", - "Episode:60/300, Reward:98.0\n", - "Episode:70/300, Reward:200.0\n", - "Episode:80/300, Reward:160.0\n", - "Episode:90/300, Reward:200.0\n", - "Episode:100/300, Reward:200.0\n", - "Episode:110/300, Reward:200.0\n", - "Episode:120/300, Reward:198.0\n", - "Episode:130/300, Reward:200.0\n", - "Episode:140/300, Reward:200.0\n", - "Episode:150/300, Reward:200.0\n", - "Episode:160/300, Reward:200.0\n", - "Episode:170/300, Reward:200.0\n", - "Episode:180/300, Reward:200.0\n", - "Episode:190/300, Reward:200.0\n", - "Episode:200/300, Reward:200.0\n", - "Episode:210/300, Reward:200.0\n", - "Episode:220/300, Reward:200.0\n", - "Episode:230/300, Reward:188.0\n", - "Episode:240/300, Reward:200.0\n", - "Episode:250/300, Reward:200.0\n", - "Episode:260/300, Reward:193.0\n", - "Episode:270/300, Reward:200.0\n", - "Episode:280/300, Reward:200.0\n", - "Episode:290/300, Reward:200.0\n", - "Episode:300/300, Reward:200.0\n", - "Complete training!\n", - "results saved!\n" - ] - }, 
- { - "output_type": "display_data", - "data": { - "text/plain": "
", - "image/svg+xml": "\n\n\n \n \n \n \n 2021-05-04T19:04:03.044086\n image/svg+xml\n \n \n Matplotlib v3.4.1, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "image/png": "\n" - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Episode:10/50, reward:188.0\n", - "Episode:20/50, reward:200.0\n", - "Episode:30/50, reward:200.0\n", - "Episode:40/50, reward:200.0\n", - "Episode:50/50, reward:171.0\n", - "results saved!\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": "
", - "image/svg+xml": "\n\n\n \n \n \n \n 2021-05-04T19:04:05.465993\n image/svg+xml\n \n \n Matplotlib v3.4.1, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "image/png": "\n" - }, - "metadata": {} - } - ], - "source": [ - "if __name__ == \"__main__\":\n", - " cfg = DQNConfig()\n", - "\n", - " # train\n", - " env,agent = env_agent_config(cfg,seed=1)\n", - " rewards, ma_rewards = train(cfg, env, agent)\n", - " make_dir(cfg.result_path, cfg.model_path)\n", - " agent.save(path=cfg.model_path)\n", - " save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)\n", - " plot_rewards(rewards, ma_rewards, tag=\"train\",\n", - " algo=cfg.algo, path=cfg.result_path)\n", - " # eval\n", - " env,agent = env_agent_config(cfg,seed=10)\n", - " agent.load(path=cfg.model_path)\n", - " rewards,ma_rewards = eval(cfg,env,agent)\n", - " 
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)\n", - " plot_rewards(rewards,ma_rewards,tag=\"eval\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)" - ] - } - ] -} \ No newline at end of file diff --git a/codes/PPO/task0_train.py b/codes/PPO/task0_train.py index ccca805..04dfae0 100644 --- a/codes/PPO/task0_train.py +++ b/codes/PPO/task0_train.py @@ -100,7 +100,7 @@ def eval(cfg,env,agent): 0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}") + print(f"Episode:{i_ep+1}/{cfg.eval_eps}, Reward:{ep_reward:.3f}") print('Complete evaling!') return rewards,ma_rewards diff --git a/codes/PolicyGradient/README.md b/codes/PolicyGradient/README.md index 0f9fec3..956cdbf 100644 --- a/codes/PolicyGradient/README.md +++ b/codes/PolicyGradient/README.md @@ -8,12 +8,16 @@ Policy-based方法是强化学习中与Value-based(比如Q-learning)相对的方 结合REINFORCE原理,其伪代码如下: +image-20211016004808604 + +https://pytorch.org/docs/stable/distributions.html + +加负号的原因是,在公式中应该是实现的梯度上升算法,而loss一般使用随机梯度下降的,所以加个负号保持一致性。 + ![img](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210428001336032.png) ## 实现 - - ## 参考 [REINFORCE和Reparameterization Trick](https://blog.csdn.net/JohnJim0/article/details/110230703) diff --git a/codes/PolicyGradient/agent.py b/codes/PolicyGradient/agent.py index be67601..8f349b5 100644 --- a/codes/PolicyGradient/agent.py +++ b/codes/PolicyGradient/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-22 23:27:44 LastEditor: John -LastEditTime: 2021-05-05 17:33:10 +LastEditTime: 2021-10-16 00:43:52 Discription: Environment: ''' @@ -56,7 +56,6 @@ class PolicyGradient: state = state_pool[i] action = Variable(torch.FloatTensor([action_pool[i]])) reward = reward_pool[i] - state = Variable(torch.from_numpy(state).float()) probs = self.policy_net(state) m = Bernoulli(probs) diff --git 
a/codes/PolicyGradient/assets/image-20211016004808604.png b/codes/PolicyGradient/assets/image-20211016004808604.png new file mode 100644 index 0000000..b0a56b5 Binary files /dev/null and b/codes/PolicyGradient/assets/image-20211016004808604.png differ diff --git a/codes/PolicyGradient/task0_train.py b/codes/PolicyGradient/task0_train.py index c1f4e5c..a7fb0d2 100644 --- a/codes/PolicyGradient/task0_train.py +++ b/codes/PolicyGradient/task0_train.py @@ -5,14 +5,14 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-22 23:21:53 LastEditor: John -LastEditTime: 2021-05-05 17:35:20 +LastEditTime: 2021-10-16 00:34:13 Discription: Environment: ''' import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加父路径到系统路径sys.path import gym import torch @@ -23,21 +23,20 @@ from PolicyGradient.agent import PolicyGradient from common.plot import plot_rewards from common.utils import save_results,make_dir -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 class PGConfig: def __init__(self): - self.algo = "PolicyGradient" # name of algo - self.env = 'CartPole-v0' + self.algo = "PolicyGradient" # 算法名称 + self.env = 'CartPole-v0' # 环境名称 self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results + '/'+curr_time+'/results/' # 保存结果的路径 self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models - self.train_eps = 300 # 训练的episode数目 - self.eval_eps = 50 + '/'+curr_time+'/models/' # 保存模型的路径 + self.train_eps = 300 # 训练的回合数 + self.eval_eps = 30 # 测试的回合数 self.batch_size = 8 - self.lr = 0.01 # learning rate + self.lr = 0.01 # 学习率 self.gamma 
= 0.99 self.hidden_dim = 36 # dimmension of hidden layer self.device = torch.device( @@ -59,7 +58,7 @@ def train(cfg,env,agent): reward_pool = [] rewards = [] ma_rewards = [] - for i_episode in range(cfg.train_eps): + for i_ep in range(cfg.train_eps): state = env.reset() ep_reward = 0 for _ in count(): @@ -73,9 +72,9 @@ def train(cfg,env,agent): reward_pool.append(reward) state = next_state if done: - print('Episode:', i_episode, ' Reward:', ep_reward) + print('Episode:', i_ep, ' Reward:', ep_reward) break - if i_episode > 0 and i_episode % cfg.batch_size == 0: + if i_ep > 0 and i_ep % cfg.batch_size == 0: agent.update(reward_pool,state_pool,action_pool) state_pool = [] # 每个episode的state action_pool = [] @@ -95,7 +94,7 @@ def eval(cfg,env,agent): print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') rewards = [] ma_rewards = [] - for i_episode in range(cfg.eval_eps): + for i_ep in range(cfg.eval_eps): state = env.reset() ep_reward = 0 for _ in count(): @@ -106,7 +105,7 @@ def eval(cfg,env,agent): reward = 0 state = next_state if done: - print('Episode:', i_episode, ' Reward:', ep_reward) + print('Episode:', i_ep, ' Reward:', ep_reward) break rewards.append(ep_reward) if ma_rewards: @@ -116,6 +115,7 @@ def eval(cfg,env,agent): ma_rewards.append(ep_reward) print('complete evaling!') return rewards, ma_rewards + if __name__ == "__main__": cfg = PGConfig() diff --git a/codes/README.md b/codes/README.md index e7b9e6f..2c421ae 100644 --- a/codes/README.md +++ b/codes/README.md @@ -18,14 +18,14 @@ ## 运行环境 -python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.19.0 +python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 ## 使用说明 运行带有```train```的py文件或ipynb文件进行训练,如果前面带有```task```如```task0_train.py```,表示对task0任务训练, 类似的带有```eval```即为测试。 -## 算法进度 +## 内容导航 | 算法名称 | 相关论文材料 | 环境 | 备注 | | :--------------------------------------: | :----------------------------------------------------------: | ----------------------------------------- | :--------------------------------: | diff --git 
a/codes/common/model.py b/codes/common/model.py index 9800dbf..be03368 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -15,15 +15,15 @@ import torch.nn.functional as F from torch.distributions import Categorical class MLP(nn.Module): - def __init__(self, input_dim,output_dim,hidden_dim=128): + def __init__(self, n_states,n_actions,hidden_dim=128): """ 初始化q网络,为全连接网络 - input_dim: 输入的特征数即环境的状态数 - output_dim: 输出的动作维度 + n_states: 输入的特征数即环境的状态数 + n_actions: 输出的动作维度 """ super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层 + self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层 + self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -32,10 +32,10 @@ class MLP(nn.Module): return self.fc3(x) class Critic(nn.Module): - def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3): + def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_obs + output_dim, hidden_size) + self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) # 随机初始化为较小的值 @@ -51,11 +51,11 @@ class Critic(nn.Module): return x class Actor(nn.Module): - def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3): + def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): super(Actor, self).__init__() self.linear1 = nn.Linear(n_obs, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, output_dim) + self.linear3 = nn.Linear(hidden_size, n_actions) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -67,18 +67,18 @@ class Actor(nn.Module): return x class ActorCritic(nn.Module): - def __init__(self, input_dim, output_dim, hidden_dim=256): + def __init__(self, n_states, 
n_actions, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(input_dim, hidden_dim), + nn.Linear(n_states, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(input_dim, hidden_dim), + nn.Linear(n_states, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, output_dim), + nn.Linear(hidden_dim, n_actions), nn.Softmax(dim=1), ) diff --git a/codes/common/plot.py b/codes/common/plot.py index 6707ff8..d14b8d4 100644 --- a/codes/common/plot.py +++ b/codes/common/plot.py @@ -11,36 +11,52 @@ Environment: ''' import matplotlib.pyplot as plt import seaborn as sns -from matplotlib.font_manager import FontProperties -def chinese_font(): - return FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc',size=15) # 系统字体路径,此处是mac的 -def plot_rewards(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN",save=True,path='./'): - sns.set() - plt.title("average learning curve of {} for {}".format(algo,env)) +# from matplotlib.font_manager import FontProperties # 导入字体模块 + +# def chinese_font(): +# ''' 设置中文字体 +# ''' +# return FontProperties(fname='/System/Library/Fonts/STHeiti Light.ttc',size=15) # fname系统字体路径,此处是mac的 +# def plot_rewards_cn(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN",save=True,path='./'): +# ''' 中文画图 +# ''' +# sns.set() +# plt.figure() +# plt.title(u"{}环境下{}算法的学习曲线".format(env,algo),fontproperties=chinese_font()) +# plt.xlabel(u'回合数',fontproperties=chinese_font()) +# plt.plot(rewards) +# plt.plot(ma_rewards) +# plt.legend((u'奖励',u'滑动平均奖励',),loc="best",prop=chinese_font()) +# if save: +# plt.savefig(path+f"{tag}_rewards_curve_cn") +# # plt.show() + +def plot_rewards(rewards,ma_rewards,plot_cfg,tag='train'): + sns.set() + plt.figure() # 创建一个图形实例,方便同时多画几个图 + plt.title("learning curve on {} of {} for {}".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env)) plt.xlabel('epsiodes') plt.plot(rewards,label='rewards') plt.plot(ma_rewards,label='ma rewards') plt.legend() - 
if save: - plt.savefig(path+"{}_rewards_curve".format(tag)) + if plot_cfg.save: + plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag)) plt.show() - -def plot_rewards_cn(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN",save=True,path='./'): - ''' 中文画图 - ''' - sns.set() - plt.figure() - plt.title(u"{}环境下{}算法的学习曲线".format(env,algo),fontproperties=chinese_font()) - plt.xlabel(u'回合数',fontproperties=chinese_font()) - plt.plot(rewards) - plt.plot(ma_rewards) - plt.legend((u'奖励',u'滑动平均奖励',),loc="best",prop=chinese_font()) - if save: - plt.savefig(path+f"{tag}_rewards_curve_cn") - # plt.show() +# def plot_rewards(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN",save=True,path='./'): +# sns.set() +# plt.figure() # 创建一个图形实例,方便同时多画几个图 +# plt.title("average learning curve of {} for {}".format(algo,env)) +# plt.xlabel('epsiodes') +# plt.plot(rewards,label='rewards') +# plt.plot(ma_rewards,label='ma rewards') +# plt.legend() +# if save: +# plt.savefig(path+"{}_rewards_curve".format(tag)) +# plt.show() def plot_losses(losses,algo = "DQN",save=True,path='./'): sns.set() + plt.figure() plt.title("loss curve of {}".format(algo)) plt.xlabel('epsiodes') plt.plot(losses,label='rewards') diff --git a/codes/envs/README.md b/codes/envs/README.md new file mode 100644 index 0000000..e93fba0 --- /dev/null +++ b/codes/envs/README.md @@ -0,0 +1,6 @@ +## 环境汇总 + +[OpenAI Gym](./gym_info.md) +[MuJoCo](./mujoco_info.md) + +