update codes

This commit is contained in:
johnjim0816
2021-11-17 14:36:51 +08:00
parent 8e5090a653
commit 442e307b01
81 changed files with 976 additions and 401 deletions

View File

@@ -0,0 +1,218 @@
# DQN
## 原理简介
DQN是Q-leanning算法的优化和延伸Q-leaning中使用有限的Q表存储值的信息而DQN中则用神经网络替代Q表存储信息这样更适用于高维的情况相关知识基础可参考[datawhale李宏毅笔记-Q学习](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6)。
论文方面主要可以参考两篇一篇就是2013年谷歌DeepMind团队的[Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)一篇是也是他们团队后来在Nature杂志上发表的[Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf)。后者在算法层面增加target q-net也可以叫做Nature DQN。
Nature DQN使用了两个Q网络一个当前Q网络𝑄用来选择动作更新模型参数另一个目标Q网络𝑄用于计算目标Q值。目标Q网络的网络参数不需要迭代更新而是每隔一段时间从当前Q网络𝑄复制过来即延时更新这样可以减少目标Q值和当前的Q值相关性。
要注意的是两个Q网络的结构是一模一样的。这样才可以复制网络参数。Nature DQN和[Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)相比除了用一个新的相同结构的目标Q网络来计算目标Q值以外其余部分基本是完全相同的。细节也可参考[强化学习Deep Q-Learning进阶之Nature DQN](https://www.cnblogs.com/pinard/p/9756075.html)。
https://blog.csdn.net/JohnJim0/article/details/109557173)
## 伪代码
<img src="assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png" alt="img" style="zoom:50%;" />
## 代码实现
### RL接口
首先是强化学习训练的基本接口,即通用的训练模式:
```python
for i_episode in range(MAX_EPISODES):
state = env.reset() # reset环境状态
for i_step in range(MAX_STEPS):
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action) # 更新环境参数
agent.memory.push(state, action, reward, next_state, done) # 将state等这些transition存入memory
agent.update() # 每步更新网络
state = next_state # 跳转到下一个状态
if done:
break
```
每个episode加一个MAX_STEPS也可以使用while not done, 加这个max_steps有时是因为比如gym环境训练目标就是在200个step下达到200的reward或者是当完成一个episode的步数较多时也可以设置基本流程跟所有伪代码一致如下
1. agent选择动作
2. 环境根据agent的动作反馈出next_state和reward
3. agent进行更新如有memory就会将transition(包含staterewardaction等)存入memory中
4. 跳转到下一个状态
5. 如果done了就跳出循环进行下一个episode的训练。
想要实现完整的算法还需要创建QnetReplaybuffer等类
### 两个Q网络
上文讲了Nature DQN中有两个Q网络一个是policy_net一个是延时更新的target_net两个网络的结构是一模一样的如下(见```model.py```)注意DQN使用的Qnet就是全连接网络即FCH
```python
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, state_dim=4, action_dim=18):
""" 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(state_dim, 128) # 输入层
self.fc2 = nn.Linear(128, 128) # 隐藏层
self.fc3 = nn.Linear(128, action_dim) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
```
输入为state_dim输出为action_dim包含一个128维度的隐藏层这里根据需要可增加隐藏层维度和数量然后一般使用relu激活函数这里跟深度学习的网路设置是一样的。
### Replay Buffer
然后就是Replay Memory了其作用主要是是克服经验数据的相关性correlated data和非平稳分布non-stationary distribution问题实现如下(见```memory.py```)
```python
import random
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = zip(*batch)
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
```
参数capacity表示buffer的容量主要包括push和sample两个步骤push是将transitions放到memory中sample是从memory随机抽取一些transition。
### Agent类
在```agent.py```中我们定义强化学习算法类,包括```choose_action```(选择动作使用e-greedy策略时会多一个```predict```函数,下面会将到)和```update```(更新)等函数。
在类中建立两个网络以及optimizer和memory
```python
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
```
然后是选择action
```python
def choose_action(self, state):
'''选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
action = self.predict(state)
else:
action = random.randrange(self.action_dim)
return action
```
这里使用e-greedy策略即设置一个参数epsilon如果生成的随机数大于epsilon就根据网络预测的选择action否则还是随机选择action这个epsilon是会逐渐减小的可以使用线性或者指数减小的方式但不会减小到零这样在训练稳定时还能保持一定的探索这部分可以学习探索与利用(exploration and exploition)相关知识。
上面讲到的预测函数其实就是根据state选取q值最大的action如下
```python
def predict(self,state):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item()
```
然后是更新函数了:
```python
def update(self):
if len(self.memory) < self.batch_size:
return
# 从memory中随机采样transition
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
'''转为张量
例如tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])'''
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
1) # 例如tensor([[1],...,[0]])
reward_batch = torch.tensor(
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device)
'''计算当前(s_t,a)对应的Q(s_t, a)'''
'''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])'''
q_values = self.policy_net(state_batch).gather(
dim=1, index=action_batch) # 等价于self.forward
# 计算所有next states的V(s_{t+1})即通过target_net中选取reward最大的对应states
next_q_values = self.target_net(next_state_batch).max(
1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,])
# 计算 expected_q_value
# 对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + \
self.gamma * next_q_values * (1-done_batch)
# self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算 均方误差loss
# 优化模型
self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step
# loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
loss.backward()
# for param in self.policy_net.parameters(): # clip防止梯度爆炸
# param.grad.data.clamp_(-1, 1)
self.optimizer.step() # 更新模型
```
更新遵循伪代码的以下部分:
<img src="assets/image-20210507162813393.png" alt="image-20210507162813393" style="zoom:50%;" />
首先从replay buffer中选取一个batch的数据计算loss然后进行minibatch SGD。
然后是保存与加载模型的部分,如下:
```python
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)
```
### 实验结果
训练结果如下:
<img src="assets/train_rewards_curve.png" alt="train_rewards_curve" style="zoom: 67%;" />
<img src="assets/eval_rewards_curve.png" alt="eval_rewards_curve" style="zoom:67%;" />
## 参考
[with torch.no_grad()](https://www.jianshu.com/p/1cea017f5d11)

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-09-15 13:35:36
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import numpy as np
from common.memory import ReplayBuffer
from common.model import MLP
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-09-15 15:34:13
@Discription:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加父路径到系统路径sys.path
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.plot import plot_rewards
from DQN.agent import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class DQNConfig:
def __init__(self):
self.algo = "DQN" # 算法名称
self.env = 'CartPole-v0' # 环境名称
self.train_eps = 200 # 训练的回合数
self.eval_eps = 30 # 测试的回合数
self.gamma = 0.95 # 强化学习中的折扣因子
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
self.lr = 0.0001 # 学习率
self.memory_capacity = 100000 # 经验回放的容量
self.batch_size = 64 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.hidden_dim = 256 # 网络隐藏层
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env = 'CartPole-v0' # 环境名称
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
def env_agent_config(cfg,seed=1):
''' 创建环境和智能体
'''
env = gym.make(cfg.env) # 创建环境
env.seed(seed) # 设置随机种子
n_states = env.observation_space.shape[0] # 状态数
n_actions = env.action_space.n # 动作数
agent = DQN(n_states,n_actions,cfg) # 创建智能体
return env,agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward, next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
if (i_ep+1)%10 == 0:
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def eval(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.eval_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards,ma_rewards, plot_cfg, tag="eval") # 画出结果

View File

@@ -0,0 +1,39 @@
食用本篇之前需要有DQN算法的基础参考[DQN算法实战](../DQN)。
## 原理简介
Double-DQN是2016年提出的算法灵感源自2010年的Double-Qlearning可参考论文[Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)。
跟Nature DQN一样Double-DQN也用了两个网络一个当前网络(对应用$Q$表示),一个目标网络(对应一般用$Q'$表示,为方便区分,以下用$Q_{tar}$代替)。我们先回忆一下,对于非终止状态,目标$Q_{tar}$值计算如下
![在这里插入图片描述](assets/20201222145725907.png)
而在Double-DQN中不再是直接从目标$Q_{tar}$网络中选择各个动作中的最大$Q_{tar}$值,而是先从当前$Q$网络选择$Q$值最大对应的动作,然后代入到目标网络中计算对应的值:
![在这里插入图片描述](assets/20201222150225327.png)
Double-DQN的好处是Nature DQN中使用max虽然可以快速让Q值向可能的优化目标靠拢但是很容易过犹不及导致过度估计(Over Estimation),所谓过度估计就是最终我们得到的算法模型有很大的偏差(bias)。为了解决这个问题, DDQN通过解耦目标Q值动作的选择和目标Q值的计算这两步来达到消除过度估计的问题感兴趣可以阅读原论文。
伪代码如下:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png)
当然也可以两个网络可以同时为当前网络和目标网络,如下:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png)
或者这样更好理解如何同时为当前网络和目标网络:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png)
## 代码实战
完整程序见[github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN)。结合上面的原理其实Double DQN改进来很简单基本只需要在```update```中修改几行代码,如下:
```python
'''以下是Nature DQN的q_target计算方式
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # # 计算所有next states的Q'(s_{t+1})的最大值Q'为目标网络的q函数,比如tensor([ 0.0060, -0.0171,...,])
#计算 q_target
#对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''以下是Double DQNq_target计算方式与NatureDQN稍有不同'''
next_target_values = self.target_net(
next_state_batch)
#选出Q(s_t, a)对应的action代入到next_target_values获得target net对应的next_q_value即Q(s_t|a=argmax Q(s_t, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
```
reward变化结果如下
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png)
其中下边蓝色和红色分别表示Double DQN和Nature DQN在训练中的reward变化图而上面蓝色和绿色则表示Double DQN和Nature DQN在测试中的reward变化图。

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-05-04 22:28:06
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import numpy as np
from common.memory import ReplayBuffer
from common.model import MLP
class DoubleDQN:
def __init__(self, state_dim, action_dim, cfg):
self.action_dim = action_dim # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma
# e-greedy策略相关参数
self.actions_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data)
# self.target_net.eval() # 不启用 BatchNormalization 和 Dropout
# 可查parameters()与state_dict()的区别前者require_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = ReplayBuffer(cfg.memory_capacity)
def predict(self,state):
with torch.no_grad():
# 先转为张量便于丢给神经网络,state元素数据原本为float64
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1)返回每行的最大值以及对应的下标,
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
return action
def choose_action(self, state):
'''选择动作
'''
self.actions_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
if random.random() > self.epsilon:
action = self.predict(state)
else:
action = random.randrange(self.action_dim)
return action
def update(self):
if len(self.memory) < self.batch_size:
return
# 从memory中随机采样transition
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# convert to tensor
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
1) # 例如tensor([[1],...,[0]])
reward_batch = torch.tensor(
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device) # 将bool转为float然后转为张量
# 计算当前(s_t,a)对应的Q(s_t, a)
q_values = self.policy_net(state_batch)
next_q_values = self.policy_net(next_state_batch)
# 代入当前选择的action得到Q(s_t|a=a_t)
q_value = q_values.gather(dim=1, index=action_batch)
'''以下是Nature DQN的q_target计算方式
# 计算所有next states的Q'(s_{t+1})的最大值Q'为目标网络的q函数
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,])
# 计算 q_target
# 对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''以下是Double DQN q_target计算方式与NatureDQN稍有不同'''
next_target_values = self.target_net(
next_state_batch)
# 选出Q(s_t, a)对应的action代入到next_target_values获得target net对应的next_q_value即Q(s_t|a=argmax Q(s_t, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # 计算 均方误差loss
# 优化模型
self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step
# loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
self.loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # 更新模型
def save(self,path):
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
def load(self,path):
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

View File

@@ -0,0 +1,194 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3710jvsc74a57bd0366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232",
"display_name": "Python 3.7.10 64-bit ('py37': conda)"
},
"metadata": {
"interpreter": {
"hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"curr_path = str(Path().absolute())\n",
"parent_path = str(Path().absolute().parent)\n",
"sys.path.append(parent_path) # add current terminal path to sys.path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import torch\n",
"import datetime\n",
"from DoubleDQN.agent import DoubleDQN\n",
"from common.plot import plot_rewards\n",
"from common.utils import save_results, make_dir\n",
"\n",
"curr_time = datetime.datetime.now().strftime(\n",
" \"%Y%m%d-%H%M%S\") # obtain current time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class DoubleDQNConfig:\n",
" def __init__(self):\n",
" self.algo = \"DoubleDQN\" # name of algo\n",
" self.env = 'CartPole-v0' # env name\n",
" self.result_path = curr_path+\"/outputs/\" + self.env + \\\n",
" '/'+curr_time+'/results/' # path to save results\n",
" self.model_path = curr_path+\"/outputs/\" + self.env + \\\n",
" '/'+curr_time+'/models/' # path to save models\n",
" self.train_eps = 200 # max tranng episodes\n",
" self.eval_eps = 50 # max evaling episodes\n",
" self.gamma = 0.95\n",
" self.epsilon_start = 1 # start epsilon of e-greedy policy\n",
" self.epsilon_end = 0.01 \n",
" self.epsilon_decay = 500\n",
" self.lr = 0.001 # learning rate\n",
" self.memory_capacity = 100000 # capacity of Replay Memory\n",
" self.batch_size = 64\n",
" self.target_update = 2 # update frequency of target net\n",
" self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # check gpu\n",
" self.hidden_dim = 256 # hidden size of net"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def env_agent_config(cfg,seed=1):\n",
" env = gym.make(cfg.env) \n",
" env.seed(seed)\n",
" state_dim = env.observation_space.shape[0]\n",
" action_dim = env.action_space.n\n",
" agent = DoubleDQN(state_dim,action_dim,cfg)\n",
" return env,agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train(cfg,env,agent):\n",
" print('Start to train !')\n",
" rewards,ma_rewards = [],[]\n",
" for i_ep in range(cfg.train_eps):\n",
" state = env.reset() \n",
" ep_reward = 0\n",
" while True:\n",
" action = agent.choose_action(state) \n",
" next_state, reward, done, _ = env.step(action)\n",
" ep_reward += reward\n",
" agent.memory.push(state, action, reward, next_state, done) \n",
" state = next_state \n",
" agent.update() \n",
" if done:\n",
" break\n",
" if i_ep % cfg.target_update == 0:\n",
" agent.target_net.load_state_dict(agent.policy_net.state_dict())\n",
" if (i_ep+1)%10 == 0:\n",
" print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}')\n",
" rewards.append(ep_reward)\n",
" if ma_rewards:\n",
" ma_rewards.append(\n",
" 0.9*ma_rewards[-1]+0.1*ep_reward)\n",
" else:\n",
" ma_rewards.append(ep_reward) \n",
" print('Complete training')\n",
" return rewards,ma_rewards"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def eval(cfg,env,agent):\n",
" print('Start to eval !')\n",
" print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n",
" rewards = [] \n",
" ma_rewards = []\n",
" for i_ep in range(cfg.eval_eps):\n",
" state = env.reset() \n",
" ep_reward = 0 \n",
" while True:\n",
" action = agent.predict(state) \n",
" next_state, reward, done, _ = env.step(action) \n",
" state = next_state \n",
" ep_reward += reward\n",
" if done:\n",
" break\n",
" rewards.append(ep_reward)\n",
" if ma_rewards:\n",
" ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n",
" else:\n",
" ma_rewards.append(ep_reward)\n",
" print(f\"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}\")\n",
" print('Complete evaling')\n",
" return rewards,ma_rewards "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if __name__ == \"__main__\":\n",
" cfg = DoubleDQNConfig()\n",
" # train\n",
" env,agent = env_agent_config(cfg,seed=1)\n",
" rewards, ma_rewards = train(cfg, env, agent)\n",
" make_dir(cfg.result_path, cfg.model_path)\n",
" agent.save(path=cfg.model_path)\n",
" save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)\n",
" plot_rewards(rewards, ma_rewards, tag=\"train\",\n",
" algo=cfg.algo, path=cfg.result_path)\n",
"\n",
" # eval\n",
" env,agent = env_agent_config(cfg,seed=10)\n",
" agent.load(path=cfg.model_path)\n",
" rewards,ma_rewards = eval(cfg,env,agent)\n",
" save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)\n",
" plot_rewards(rewards,ma_rewards,tag=\"eval\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)"
]
}
]
}

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-09-10 15:26:05
@Discription:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import gym
import torch
import datetime
from DoubleDQN.agent import DoubleDQN
from common.plot import plot_rewards
from common.utils import save_results, make_dir
curr_time = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
class DoubleDQNConfig:
def __init__(self):
self.algo = "DoubleDQN" # name of algo
self.env = 'CartPole-v0' # env name
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # path to save models
self.train_eps = 200 # max tranng episodes
self.eval_eps = 50 # max evaling episodes
self.gamma = 0.95
self.epsilon_start = 1 # start epsilon of e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 500
self.lr = 0.001 # learning rate
self.memory_capacity = 100000 # capacity of Replay Memory
self.batch_size = 64
self.target_update = 2 # update frequency of target net
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
self.hidden_dim = 256 # hidden size of net
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DoubleDQN(state_dim,action_dim,cfg)
return env,agent
def train(cfg,env,agent):
print('Start to train !')
rewards,ma_rewards = [],[]
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
state = next_state
agent.update()
if done:
break
if i_ep % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward},Epsilon:{agent.epsilon:.2f}')
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Complete training')
return rewards,ma_rewards
def eval(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.eval_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.predict(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}")
print('Complete evaling')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DoubleDQNConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,13 @@
# Hierarchical DQN
## 原理简介
Hierarchical DQN是一种分层强化学习方法与DQN相比增加了一个meta controller
![image-20210331153115575](assets/image-20210331153115575.png)
即学习时meta controller每次会生成一个goal然后controller或者说下面的actor就会达到这个goal直到done为止。这就相当于给agent增加了一个队长队长擅长制定局部目标指导agent前行这样应对一些每回合步数较长或者稀疏奖励的问题会有所帮助。
## 伪代码
![image-20210331153542314](assets/image-20210331153542314.png)

View File

@@ -0,0 +1,115 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18
LastEditor: John
LastEditTime: 2021-05-04 22:39:34
Discription:
Environment:
'''
import torch
import torch.nn as nn
import numpy as np
import random,math
import torch.optim as optim
from common.model import MLP
from common.memory import ReplayBuffer
class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg):
self.state_dim = state_dim
self.action_dim = action_dim
self.gamma = cfg.gamma
self.device = cfg.device
self.batch_size = cfg.batch_size
self.frame_idx = 0
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
self.meta_memory = ReplayBuffer(cfg.memory_capacity)
self.loss_numpy = 0
self.meta_loss_numpy = 0
self.losses = []
self.meta_losses = []
def to_onehot(self,x):
oh = np.zeros(self.state_dim)
oh[x - 1] = 1.
return oh
def set_goal(self,state):
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item()
else:
goal = random.randrange(self.state_dim)
return goal
def choose_action(self,state):
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
return action
def update(self):
self.update_policy()
self.update_meta()
def update_policy(self):
if self.batch_size > len(self.memory):
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch),device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values)
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
self.loss_numpy = loss.detach().cpu().numpy()
self.losses.append(self.loss_numpy)
def update_meta(self):
if self.batch_size > len(self.meta_memory):
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch),device=self.device)
q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
meta_loss = nn.MSELoss()(q_values, expected_q_values)
self.meta_optimizer.zero_grad()
meta_loss.backward()
for param in self.meta_policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.meta_optimizer.step()
self.meta_loss_numpy = meta_loss.detach().cpu().numpy()
self.meta_losses.append(self.meta_loss_numpy)
def save(self, path):
torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
def load(self, path):
self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 311 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,146 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-29 10:37:32
LastEditor: John
LastEditTime: 2021-05-04 22:35:56
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import datetime
import numpy as np
import torch
import gym
from common.utils import save_results,make_dir
from common.plot import plot_rewards
from HierarchicalDQN.agent import HierarchicalDQN
curr_time = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
class HierarchicalDQNConfig:
def __init__(self):
self.algo = "H-DQN" # name of algo
self.env = 'CartPole-v0'
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # path to save models
self.train_eps = 300 # 训练的episode数目
self.eval_eps = 50 # 测试的episode数目
self.gamma = 0.99
self.epsilon_start = 1 # start epsilon of e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.0001 # learning rate
self.memory_capacity = 10000 # Replay Memory capacity
self.batch_size = 32
self.target_update = 2 # target net的更新频率
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.hidden_dim = 256 # dimension of hidden layer
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg)
return env,agent
def train(cfg, env, agent):
print('Start to train !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = [] # moveing average reward
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
goal = agent.set_goal(state)
onehot_goal = agent.to_onehot(goal)
meta_state = state
extrinsic_reward = 0
while not done and goal != np.argmax(state):
goal_state = np.concatenate([state, onehot_goal])
action = agent.choose_action(goal_state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
extrinsic_reward += reward
intrinsic_reward = 1.0 if goal == np.argmax(
next_state) else 0.0
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
[next_state, onehot_goal]), done)
state = next_state
agent.update()
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Complete training')
return rewards, ma_rewards
def eval(cfg, env, agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = [] # moveing average reward
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
goal = agent.set_goal(state)
onehot_goal = agent.to_onehot(goal)
extrinsic_reward = 0
while not done and goal != np.argmax(state):
goal_state = np.concatenate([state, onehot_goal])
action = agent.choose_action(goal_state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
extrinsic_reward += reward
state = next_state
agent.update()
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}, Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}')
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Complete training')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = HierarchicalDQNConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)

View File

@@ -0,0 +1,25 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from pathlib import Path\n",
"curr_path = str(Path().absolute()) # 当前路径\n",
"parent_path = str(Path().absolute().parent) # 父路径\n",
"sys.path.append(parent_path) # 添加路径到系统路径"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,3 @@
本目录下汇总了基础的DQN及其变种或升级如下