Files
easy-rl/codes/DQN/train.ipynb
johnjim0816 3b712e8815 update codes
2021-12-21 20:14:13 +08:00

424 lines
65 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"curr_path = str(Path().absolute()) # 当前路径\n",
"parent_path = str(Path().absolute().parent) # 父路径\n",
"sys.path.append(parent_path) # 添加路径到系统路径\n",
"\n",
"import math,random\n",
"import gym\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import torch.nn.functional as F\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from IPython.display import clear_output # 清空单元格输出区域"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 网络模型"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"class MLP(nn.Module):\n",
"    \"\"\"Q-network implemented as a small fully connected network.\n",
"\n",
"    Args:\n",
"        state_dim: number of input features (size of the environment state).\n",
"        action_dim: number of outputs, one value per action.\n",
"        hidden_dim: width of both hidden layers.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, state_dim, action_dim, hidden_dim=128):\n",
"        super().__init__()\n",
"        self.fc1 = nn.Linear(state_dim, hidden_dim)    # input layer\n",
"        self.fc2 = nn.Linear(hidden_dim, hidden_dim)   # hidden layer\n",
"        self.fc3 = nn.Linear(hidden_dim, action_dim)   # output layer\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Apply ReLU after the first two layers and return the raw output of the third.\"\"\"\n",
"        hidden = F.relu(self.fc1(x))\n",
"        hidden = F.relu(self.fc2(hidden))\n",
"        q_values = self.fc3(hidden)\n",
"        return q_values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 经验回放"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"class ReplayBuffer:\n",
"    \"\"\"Fixed-capacity store of transitions that overwrites the oldest entry when full.\"\"\"\n",
"\n",
"    def __init__(self, capacity):\n",
"        self.capacity = capacity  # maximum number of stored transitions\n",
"        self.buffer = []          # underlying storage\n",
"        self.position = 0         # index the next transition is written to\n",
"\n",
"    def push(self, state, action, reward, next_state, done):\n",
"        \"\"\"Store one transition, replacing the oldest one once capacity is reached.\"\"\"\n",
"        transition = (state, action, reward, next_state, done)\n",
"        has_room = len(self.buffer) < self.capacity\n",
"        if has_room:\n",
"            # Grow the list by one slot; it is filled by the write below.\n",
"            self.buffer.append(None)\n",
"        self.buffer[self.position] = transition\n",
"        # Wrap around so the write index cycles through the slots.\n",
"        self.position = (self.position + 1) % self.capacity\n",
"\n",
"    def sample(self, batch_size):\n",
"        \"\"\"Draw ``batch_size`` distinct transitions at random and return them column-wise.\n",
"\n",
"        Returns:\n",
"            Five tuples: states, actions, rewards, next states and done flags.\n",
"        \"\"\"\n",
"        transitions = random.sample(self.buffer, batch_size)\n",
"        states, actions, rewards, next_states, dones = zip(*transitions)\n",
"        return states, actions, rewards, next_states, dones\n",
"\n",
"    def __len__(self):\n",
"        \"\"\"Number of transitions currently stored.\"\"\"\n",
"        return len(self.buffer)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DQN"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"class DQN:\n",
"    \"\"\"DQN agent: epsilon-greedy policy network, target network and replay memory.\"\"\"\n",
"\n",
"    def __init__(self, state_dim, action_dim, cfg):\n",
"        \"\"\"Build the networks, optimizer and replay memory.\n",
"\n",
"        Args:\n",
"            state_dim: number of input features of the Q-network.\n",
"            action_dim: number of discrete actions.\n",
"            cfg: config object providing device, gamma, epsilon_start, epsilon_end,\n",
"                epsilon_decay, batch_size, hidden_dim, lr and memory_capacity.\n",
"        \"\"\"\n",
"        self.action_dim = action_dim  # number of discrete actions\n",
"        self.device = cfg.device  # torch device used for networks and batches\n",
"        self.gamma = cfg.gamma  # reward discount factor\n",
"        # Epsilon-greedy exploration: epsilon decays exponentially from\n",
"        # cfg.epsilon_start towards cfg.epsilon_end as frame_idx grows. cfg is read\n",
"        # when the lambda is called, so later changes to cfg.epsilon_* take effect.\n",
"        self.frame_idx = 0  # number of choose_action calls so far\n",
"        self.epsilon = lambda frame_idx: cfg.epsilon_end + (\n",
"            (cfg.epsilon_start - cfg.epsilon_end)\n",
"            * math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
"        )\n",
"        self.batch_size = cfg.batch_size\n",
"        self.policy_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)\n",
"        self.target_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)\n",
"        # Start the target network as an exact copy of the policy network.\n",
"        self.target_net.load_state_dict(self.policy_net.state_dict())\n",
"        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)\n",
"        self.memory = ReplayBuffer(cfg.memory_capacity)  # experience replay\n",
"\n",
"    def choose_action(self, state):\n",
"        \"\"\"Pick an action epsilon-greedily and advance the decay counter.\n",
"\n",
"        Args:\n",
"            state: a single observation (array-like of features).\n",
"\n",
"        Returns:\n",
"            int: index of the chosen action.\n",
"        \"\"\"\n",
"        self.frame_idx += 1\n",
"        if random.random() > self.epsilon(self.frame_idx):\n",
"            with torch.no_grad():\n",
"                # Go through one contiguous ndarray: building a tensor from a list of\n",
"                # ndarrays is the slow path in PyTorch and warns on recent versions.\n",
"                state = torch.tensor(\n",
"                    np.asarray(state), device=self.device, dtype=torch.float32\n",
"                ).unsqueeze(0)\n",
"                q_values = self.policy_net(state)\n",
"                action = q_values.max(1)[1].item()  # action with the largest Q-value\n",
"        else:\n",
"            action = random.randrange(self.action_dim)\n",
"        return action\n",
"\n",
"    def update(self):\n",
"        \"\"\"Run one optimization step on a random mini-batch from the replay memory.\n",
"\n",
"        Does nothing until the memory holds at least ``batch_size`` transitions.\n",
"        \"\"\"\n",
"        if len(self.memory) < self.batch_size:\n",
"            return\n",
"        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(\n",
"            self.batch_size)\n",
"        # Convert to tensors; state batches are stacked into one ndarray first.\n",
"        state_batch = torch.tensor(np.asarray(state_batch), device=self.device, dtype=torch.float)\n",
"        # gather() needs int64 indices, so the dtype is stated explicitly.\n",
"        action_batch = torch.tensor(action_batch, device=self.device, dtype=torch.long).unsqueeze(1)\n",
"        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)\n",
"        next_state_batch = torch.tensor(np.asarray(next_state_batch), device=self.device, dtype=torch.float)\n",
"        done_batch = torch.tensor(np.float32(done_batch), device=self.device)\n",
"        # Q(s_t, a_t) of the actions that were actually taken.\n",
"        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)\n",
"        # max_a Q_target(s_{t+1}, a); detached so no gradient reaches the target net.\n",
"        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()\n",
"        # For terminal transitions (done == 1) the target reduces to the reward.\n",
"        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)\n",
"        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error\n",
"        self.optimizer.zero_grad()\n",
"        loss.backward()\n",
"        # Clamp every gradient element to [-1, 1] to guard against exploding gradients.\n",
"        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)\n",
"        self.optimizer.step()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DQN参数"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"class DQNConfig:\n",
"    \"\"\"Hyper-parameters and run settings for the DQN experiment.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # --- experiment ---\n",
"        self.algo = \"DQN\"              # algorithm name\n",
"        self.env = 'CartPole-v0'       # environment id\n",
"        self.train_eps = 200           # number of training episodes\n",
"        self.eval_eps = 20             # number of evaluation episodes\n",
"        # --- learning ---\n",
"        self.gamma = 0.95              # discount factor\n",
"        self.epsilon_start = 0.9       # initial epsilon of the e-greedy policy\n",
"        self.epsilon_end = 0.01        # final epsilon of the e-greedy policy\n",
"        self.epsilon_decay = 500       # decay constant of epsilon\n",
"        self.lr = 1e-4                 # learning rate\n",
"        self.memory_capacity = 100_000 # replay memory capacity\n",
"        self.batch_size = 64           # mini-batch size\n",
"        self.target_update = 4         # target-network update interval\n",
"        # --- hardware / model ---\n",
"        device_name = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # prefer GPU\n",
"        self.device = torch.device(device_name)\n",
"        self.hidden_dim = 256          # hidden layer width"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 创建环境"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def env_agent_config(cfg, seed=1):\n",
"    \"\"\"Create the environment and the DQN agent.\n",
"\n",
"    Args:\n",
"        cfg: config object; ``cfg.env`` is the gym environment id and the whole\n",
"            object is passed on to ``DQN``.\n",
"        seed: random seed. Besides the environment it also seeds ``random``,\n",
"            NumPy and torch; pass ``None`` to leave those global generators alone.\n",
"\n",
"    Returns:\n",
"        (env, agent)\n",
"    \"\"\"\n",
"    env = gym.make(cfg.env)\n",
"    env.seed(seed)  # seed the environment's own generator\n",
"    if seed is not None:\n",
"        # The agent draws from `random` (exploration, replay sampling) and from torch\n",
"        # (weight initialisation), so seed them before the agent is built.\n",
"        random.seed(seed)\n",
"        np.random.seed(seed)\n",
"        torch.manual_seed(seed)\n",
"    state_dim = env.observation_space.shape[0]  # number of state features\n",
"    action_dim = env.action_space.n  # number of discrete actions\n",
"    agent = DQN(state_dim, action_dim, cfg)\n",
"    return env, agent"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 训练"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始训练!\n",
"环境CartPole-v0, 算法DQN, 设备cuda\n",
"回合10/200, 奖励12.0\n",
"回合20/200, 奖励16.0\n",
"回合30/200, 奖励15.0\n",
"回合40/200, 奖励14.0\n",
"回合50/200, 奖励13.0\n",
"回合60/200, 奖励27.0\n",
"回合70/200, 奖励36.0\n",
"回合80/200, 奖励33.0\n",
"回合90/200, 奖励200.0\n",
"回合100/200, 奖励200.0\n",
"回合110/200, 奖励200.0\n",
"回合120/200, 奖励200.0\n",
"回合130/200, 奖励200.0\n",
"回合140/200, 奖励200.0\n",
"回合150/200, 奖励200.0\n",
"回合160/200, 奖励200.0\n",
"回合170/200, 奖励200.0\n",
"回合180/200, 奖励200.0\n",
"回合190/200, 奖励200.0\n",
"回合200/200, 奖励200.0\n",
"完成训练!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def train(cfg, env, agent):\n",
"    \"\"\"Run the training loop and collect per-episode rewards.\n",
"\n",
"    Args:\n",
"        cfg: config providing env, algo, device, train_eps and target_update.\n",
"        env: environment with reset() and a step(action) returning a 4-tuple.\n",
"        agent: agent with choose_action, update, memory, policy_net and target_net.\n",
"\n",
"    Returns:\n",
"        (rewards, ma_rewards): raw episode rewards and their moving average\n",
"        (weight 0.9 on the previous average, 0.1 on the new reward).\n",
"    \"\"\"\n",
"    print('开始训练!')\n",
"    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n",
"    rewards, ma_rewards = [], []\n",
"    for i_ep in range(cfg.train_eps):\n",
"        episode_no = i_ep + 1\n",
"        ep_reward = 0  # reward accumulated within this episode\n",
"        state = env.reset()\n",
"        done = False\n",
"        while not done:\n",
"            action = agent.choose_action(state)\n",
"            next_state, reward, done, _ = env.step(action)\n",
"            agent.memory.push(state, action, reward, next_state, done)  # store the transition\n",
"            state = next_state\n",
"            agent.update()\n",
"            ep_reward += reward\n",
"        # Sync the target network every cfg.target_update episodes.\n",
"        if episode_no % cfg.target_update == 0:\n",
"            agent.target_net.load_state_dict(agent.policy_net.state_dict())\n",
"        if episode_no % 10 == 0:\n",
"            print('回合:{}/{}, 奖励:{}'.format(episode_no, cfg.train_eps, ep_reward))\n",
"        rewards.append(ep_reward)\n",
"        smoothed = 0.9*ma_rewards[-1]+0.1*ep_reward if ma_rewards else ep_reward\n",
"        ma_rewards.append(smoothed)\n",
"    print('完成训练!')\n",
"    return rewards, ma_rewards\n",
"\n",
"def plot_rewards(rewards, ma_rewards, plot_cfg):\n",
"    \"\"\"Plot the raw and the moving-average episode rewards in a new figure.\n",
"\n",
"    Args:\n",
"        rewards: per-episode rewards.\n",
"        ma_rewards: moving average of the per-episode rewards.\n",
"        plot_cfg: object providing device, algo and env for the title.\n",
"    \"\"\"\n",
"    # clear_output(True)  # uncomment to clear earlier figures when plotting repeatedly\n",
"    sns.set()\n",
"    plt.figure()  # fresh figure so several plots can coexist\n",
"    plt.title(\"learning curve on {} of {} for {}\".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env))\n",
"    plt.xlabel('episodes')\n",
"    plt.plot(rewards, label='rewards')\n",
"    plt.plot(ma_rewards, label='ma rewards')\n",
"    plt.legend()\n",
"    plt.show()\n",
"\n",
"class PlotConfig:\n",
"    \"\"\"Labels used for the title of the reward plots.\"\"\"\n",
"\n",
"    def __init__(self) -> None:\n",
"        self.algo = \"DQN\"          # algorithm name\n",
"        self.env = 'CartPole-v0'   # environment id\n",
"        device_name = \"cuda\" if torch.cuda.is_available() else \"cpu\"  # prefer GPU\n",
"        self.device = torch.device(device_name)\n",
"\n",
"# Build the configs, environment and agent, then train and plot the learning curve.\n",
"cfg = DQNConfig()\n",
"plot_cfg = PlotConfig()\n",
"env, agent = env_agent_config(cfg, seed=1)\n",
"rewards, ma_rewards = train(cfg, env, agent)\n",
"plot_rewards(rewards, ma_rewards, plot_cfg)  # draw the training curve"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始测试!\n",
"环境CartPole-v0, 算法DQN, 设备cuda\n",
"回合3/20, 奖励200.0\n",
"回合6/20, 奖励200.0\n",
"回合9/20, 奖励200.0\n",
"回合12/20, 奖励200.0\n",
"回合15/20, 奖励200.0\n",
"回合18/20, 奖励200.0\n",
"完成测试!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def eval(cfg, env, agent):\n",
"    \"\"\"Run evaluation episodes with exploration switched off.\n",
"\n",
"    ``cfg.epsilon_start`` and ``cfg.epsilon_end`` are set to 0.0 while the\n",
"    episodes run and restored afterwards, so the caller's config is not left\n",
"    modified. This relies on the agent reading epsilon from ``cfg`` on each call.\n",
"\n",
"    Args:\n",
"        cfg: config providing env, algo, device, eval_eps, epsilon_start, epsilon_end.\n",
"        env: environment with reset() and a step(action) returning a 4-tuple.\n",
"        agent: agent exposing choose_action(state).\n",
"\n",
"    Returns:\n",
"        (rewards, ma_rewards): raw episode rewards and their moving average.\n",
"    \"\"\"\n",
"    print('开始测试!')\n",
"    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n",
"    saved_epsilon = (cfg.epsilon_start, cfg.epsilon_end)\n",
"    # Evaluation should not explore, so both ends of the epsilon schedule become 0.\n",
"    cfg.epsilon_start = 0.0\n",
"    cfg.epsilon_end = 0.0\n",
"    rewards = []     # reward of every episode\n",
"    ma_rewards = []  # moving average of the episode rewards\n",
"    try:\n",
"        for i_ep in range(cfg.eval_eps):\n",
"            ep_reward = 0  # reward accumulated within this episode\n",
"            state = env.reset()\n",
"            while True:\n",
"                action = agent.choose_action(state)\n",
"                next_state, reward, done, _ = env.step(action)\n",
"                state = next_state\n",
"                ep_reward += reward\n",
"                if done:\n",
"                    break\n",
"            rewards.append(ep_reward)\n",
"            if ma_rewards:\n",
"                ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n",
"            else:\n",
"                ma_rewards.append(ep_reward)\n",
"            if (i_ep+1)%3 == 0:\n",
"                print(f\"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}\")\n",
"    finally:\n",
"        # Put the exploration settings back even if an episode raised.\n",
"        cfg.epsilon_start, cfg.epsilon_end = saved_epsilon\n",
"    print('完成测试!')\n",
"    return rewards, ma_rewards\n",
"\n",
"# Evaluate the trained agent and plot the per-episode rewards.\n",
"rewards, ma_rewards = eval(cfg, env, agent)\n",
"plot_rewards(rewards, ma_rewards, plot_cfg)  # draw the evaluation curve\n"
]
}
],
"metadata": {
"interpreter": {
"hash": "fe38df673a99c62a9fea33a7aceda74c9b65b12ee9d076c5851d98b692a4989a"
},
"kernelspec": {
"display_name": "Python 3.7.10 64-bit ('py37': conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
},
"metadata": {
"interpreter": {
"hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232"
}
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}