Commit 6e4d966e1f by JohnJim0816, 2021-03-28 11:18:52 +08:00 (parent 2df8d965d2)
56 changed files with 497 additions and 165 deletions


@@ -30,23 +30,7 @@
 | [Chapter 13: AlphaStar Paper Analysis](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | |
 ## Hands-on Algorithms
-| Algorithm | Reference | Environment | Notes |
-| :---: | :---: | :---: | :---: |
-| [On-Policy First-Visit MC](./codes/MonteCarlo) | | [Racetrack](./codes/envs/racetrack_env.md) | |
-| [Q-Learning](./codes/QLearning) | | [CliffWalking-v0](./codes/envs/gym_info.md) | |
-| [Sarsa](./codes/Sarsa) | | [Racetrack](./codes/envs/racetrack_env.md) | |
-| [DQN](./codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | |
-| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | Uses a CNN instead of a fully connected network |
-| [DoubleDQN](./codes/DoubleDQN) | | [CartPole-v0](./codes/envs/gym_info.md) | Results are poor, to be improved |
-| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
-| [PolicyGradient](./codes/PolicyGradient) | | [CartPole-v0](./codes/envs/gym_info.md) | |
-| A2C | | [CartPole-v0](./codes/envs/gym_info.md) | |
-| A3C | | | |
-| SAC | | | |
-| [PPO](./codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./codes/envs/gym_info.md) | |
-| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./codes/envs/gym_info.md) | |
-| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
-| GAIL | | | |
+[Click here](./codes) or jump to the ```codes``` folder for the hands-on algorithm implementations.
 ## Contributors


@@ -13,9 +13,9 @@ from A2C.model import ActorCritic
 import torch.optim as optim
 class A2C:
-    def __init__(self,n_states, n_actions, cfg):
+    def __init__(self,state_dim, action_dim, cfg):
         self.gamma = 0.99
-        self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device)
+        self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
         self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr)
     def choose_action(self, state):
         dist, value = self.model(state)


@@ -95,8 +95,8 @@ if __name__ == "__main__":
     cfg = A2CConfig()
     env = gym.make('CartPole-v0')
     env.seed(1) # set random seed for env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = A2C(n_states, n_actions, cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = A2C(state_dim, action_dim, cfg)
     train(cfg,env,agent)


@@ -13,18 +13,18 @@ import torch.nn as nn
 from torch.distributions import Categorical
 class ActorCritic(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim=256):
+    def __init__(self, state_dim, action_dim, hidden_dim=256):
         super(ActorCritic, self).__init__()
         self.critic = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim, 1)
         )
         self.actor = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_dim, n_actions),
+            nn.Linear(hidden_dim, action_dim),
             nn.Softmax(dim=1),
         )
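For orientation, a minimal sketch of the A2C update that this ActorCritic supports (the returns computation and the coefficients below are assumptions for illustration, not code from this commit):

```python
import torch

def a2c_loss(log_probs, values, returns, entropies, value_coef=0.5, entropy_coef=0.001):
    """Sketch: log_probs, values, entropies are stacked over a rollout from
    dist, value = model(state); returns are the discounted returns."""
    advantages = returns - values                            # A(s,a) ~ R - V(s)
    actor_loss = -(log_probs * advantages.detach()).mean()   # policy-gradient term
    critic_loss = advantages.pow(2).mean()                   # value-regression term
    return actor_loss + value_coef * critic_loss - entropy_coef * entropies.mean()
```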


@@ -19,12 +19,12 @@ from common.memory import ReplayBuffer
 class DDPG:
-    def __init__(self, n_states, n_actions, cfg):
+    def __init__(self, state_dim, action_dim, cfg):
         self.device = cfg.device
-        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
         for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
             target_param.data.copy_(param.data)
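The hard copy above only initializes the target networks; during training DDPG usually tracks the online networks with a soft (Polyak) update. A minimal sketch, assuming a `soft_tau` hyperparameter (not shown in this hunk):

```python
def soft_update(target_net, net, soft_tau=1e-2):
    """Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target."""
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        target_param.data.copy_(soft_tau * param.data + (1.0 - soft_tau) * target_param.data)
```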


@@ -41,17 +41,17 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.n_actions = action_space.shape[0]
+        self.action_dim = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.n_actions) * self.mu
+        self.obs = np.ones(self.action_dim) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
         self.obs = x + dx
         return self.obs
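For context, this is how the OU process is typically applied to a DDPG action. The helper below is only a sketch under common conventions (linear sigma decay and clipping to the action bounds), not necessarily the exact method defined elsewhere in this file:

```python
import numpy as np

def add_ou_noise(ou_noise, action, t=0):
    """Perturb a deterministic action with decayed OU noise and clip to the action bounds."""
    ou_noise.sigma = ou_noise.max_sigma - (ou_noise.max_sigma - ou_noise.min_sigma) * min(1.0, t / ou_noise.decay_period)
    return np.clip(action + ou_noise.evolve_obs(), ou_noise.low, ou_noise.high)
```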


@@ -82,9 +82,9 @@ if __name__ == "__main__":
     cfg = DDPGConfig()
     env = NormalizedActions(gym.make("Pendulum-v0"))
     env.seed(1)  # set env random seed
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
-    agent = DDPG(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
+    agent = DDPG(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


@@ -46,15 +46,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 class FCN(nn.Module):
-    def __init__(self, n_states=4, n_actions=18):
+    def __init__(self, state_dim=4, action_dim=18):
         """ Initialize the Q network as a fully connected network
-            n_states: number of input features, i.e. the dimension of the environment state
-            n_actions: total number of output actions
+            state_dim: number of input features, i.e. the dimension of the environment state
+            action_dim: total number of output actions
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(n_states, 128)  # input layer
+        self.fc1 = nn.Linear(state_dim, 128)  # input layer
         self.fc2 = nn.Linear(128, 128)  # hidden layer
-        self.fc3 = nn.Linear(128, n_actions)  # output layer
+        self.fc3 = nn.Linear(128, action_dim)  # output layer
     def forward(self, x):
         # activation functions of each layer
@@ -66,8 +66,8 @@ class FCN(nn.Module):
 In ```agent.py``` we define the reinforcement learning algorithm, with ```choose_action``` and ```update``` as the two main functions. In the initialization:
 ```python
-self.policy_net = FCN(n_states, n_actions).to(self.device)
-self.target_net = FCN(n_states, n_actions).to(self.device)
+self.policy_net = FCN(state_dim, action_dim).to(self.device)
+self.target_net = FCN(state_dim, action_dim).to(self.device)
 # the initial parameters of target_net are an exact copy of policy_net
 self.target_net.load_state_dict(self.policy_net.state_dict())
 self.target_net.eval()  # disable BatchNormalization and Dropout
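As a reminder of what ```update``` regresses toward (standard DQN, consistent with the agent code later in this commit), the TD target for a transition $(s, a, r, s', done)$ is

$$y = r + \gamma\,(1 - done)\,\max_{a'} Q_{target}(s', a'),$$

and the loss is the mean squared error between $Q_{policy}(s, a)$ and $y$.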


@@ -20,11 +20,11 @@ import random
 import math
 import numpy as np
 from common.memory import ReplayBuffer
-from common.model import MLP2
+from common.model import MLP
 class DQN:
-    def __init__(self, n_states, n_actions, cfg):
-        self.n_actions = n_actions  # total number of actions
+    def __init__(self, state_dim, action_dim, cfg):
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device, cpu or gpu
         self.gamma = cfg.gamma  # discount factor for rewards
         # parameters of the e-greedy policy
@@ -34,8 +34,8 @@ class DQN:
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
         # the initial parameters of target_net are an exact copy of policy_net
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()  # disable BatchNormalization and Dropout
@@ -64,7 +64,7 @@ class DQN:
                 # so tensor.max(1)[1] returns the index of the max value, i.e. the action
                 action = q_value.max(1)[1].item()
             else:
-                action = random.randrange(self.n_actions)
+                action = random.randrange(self.action_dim)
             return action
         else:
             with torch.no_grad():  # no gradient tracking


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2021-03-17 20:35:37
+LastEditTime: 2021-03-26 17:17:17
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -40,7 +40,7 @@ class DQNConfig:
         self.lr = 0.01  # learning rate
         self.memory_capacity = 800  # capacity of the replay memory
         self.batch_size = 64
-        self.train_eps = 250  # number of training episodes
+        self.train_eps = 300  # number of training episodes
         self.train_steps = 200  # max steps per training episode
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
@@ -84,9 +84,9 @@ if __name__ == "__main__":
     cfg = DQNConfig()
     env = gym.make('CartPole-v0').unwrapped  # google why unwrapped; it is usually not needed for gym here
     env.seed(1)  # set env random seed
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = DQN(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = DQN(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


codes/DQN_cnn/README.md (new file)

@@ -0,0 +1,2 @@
# DQN with cnn
The principle is the same as [DQN](../DQN); the only change is that the neural network is replaced with a convolutional neural network, used for two-dimensional observations (state or observation).
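As an illustration only (not code from this commit), a minimal convolutional Q-network in the classic DQN style, assuming 84x84 observations with 4 stacked frames:

```python
import torch.nn as nn

class CNNQNet(nn.Module):
    """Sketch of a DQN-style CNN Q-network (assumes input of shape 4x84x84)."""
    def __init__(self, in_channels=4, action_dim=6):
        super(CNNQNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),  # 7x7 feature map for 84x84 inputs
            nn.Linear(512, action_dim),             # one Q value per action
        )
    def forward(self, x):
        x = self.conv(x)
        return self.fc(x.view(x.size(0), -1))
```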

codes/DoubleDQN/README.md (new file)

@@ -0,0 +1,39 @@
Before reading this you should be familiar with the basics of DQN; see the [hands-on DQN](../DQN).
## Principle
Double-DQN was proposed in 2016 and is inspired by Double Q-learning (2010); see the paper [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461).
Like Nature DQN, Double-DQN uses two networks: a current network (denoted $Q$) and a target network (usually denoted $Q'$; to keep them apart we write $Q_{tar}$ below). Recall that, for non-terminal states, the target $Q_{tar}$ value is computed as follows:
![在这里插入图片描述](assets/20201222145725907.png)
In Double-DQN we no longer take the maximum $Q_{tar}$ value over actions directly from the target network; instead, we first select the action with the largest $Q$ value from the current $Q$ network and then plug that action into the target network to compute the corresponding value:
![在这里插入图片描述](assets/20201222150225327.png)
The point of Double-DQN: the max operator in Nature DQN pulls the Q values toward the optimization target quickly, but it easily overshoots and causes over-estimation, meaning the final model carries a large bias. To fix this, DDQN decouples the selection of the action used in the target Q value from the computation of the target Q value itself, which removes the over-estimation; see the original paper if you are interested.
The pseudocode is as follows:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png)
Of course, the two networks can also serve as each other's current and target network at the same time, as follows:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png)
Or, phrased in a way that makes the "both current and target network" idea easier to see:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png)
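In symbols (restating what the two target-value figures above show, with $done$ the termination flag):

$$y_{Nature} = r + \gamma\,(1-done)\,\max_{a'} Q_{tar}(s', a')$$

$$y_{Double} = r + \gamma\,(1-done)\,Q_{tar}\big(s',\ \arg\max_{a'} Q(s', a')\big)$$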
## Hands-on code
The full program is on [github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN). Given the principle above, the Double DQN change is quite small: basically only a few lines in ```update``` need to be modified, as follows:
```python
'''This is how Nature DQN computes q_target:
next_q_state_value = self.target_net(
    next_state_batch).max(1)[0].detach()  # max Q'(s_{t+1}) over all next states, where Q' is the target network's Q function, e.g. tensor([ 0.0060, -0.0171,...,])
# compute q_target
# for terminal states done_batch[0]=1, so the corresponding expected_q_value equals reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''This is how Double DQN computes q_target, slightly different from Nature DQN'''
next_target_values = self.target_net(
    next_state_batch)
# take the action a = argmax Q(s_t, a) from the current network and plug it into next_target_values to get the target net's next_q_value, i.e. Q(s_t|a=argmax Q(s_t, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
```
The reward curves are as follows:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png)
The lower blue and red curves are the training rewards of Double DQN and Nature DQN respectively, while the upper blue and green curves are their evaluation rewards.


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-03-13 15:01:27
+LastEditTime: 2021-03-28 11:07:35
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -16,16 +16,15 @@ LastEditTime: 2021-03-13 15:01:27
 import torch
 import torch.nn as nn
 import torch.optim as optim
-import torch.nn.functional as F
 import random
 import math
 import numpy as np
 from common.memory import ReplayBuffer
-from common.model import MLP2
+from common.model import MLP
 class DoubleDQN:
-    def __init__(self, n_states, n_actions, cfg):
-        self.n_actions = n_actions  # total number of actions
+    def __init__(self, state_dim, action_dim, cfg):
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device, cpu or gpu
         self.gamma = cfg.gamma
         # parameters of the e-greedy policy
@@ -34,8 +33,8 @@ class DoubleDQN:
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
         # the initial parameters of target_net are an exact copy of policy_net
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()  # disable BatchNormalization and Dropout
@@ -63,7 +62,7 @@ class DoubleDQN:
                 # so tensor.max(1)[1] returns the index of the max value, i.e. the action
                 action = q_value.max(1)[1].item()
             else:
-                action = random.randrange(self.n_actions)
+                action = random.randrange(self.action_dim)
             return action
     def update(self):



@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2021-03-17 20:11:19
+LastEditTime: 2021-03-28 11:05:14
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -32,7 +32,7 @@ if not os.path.exists(RESULT_PATH):
 class DoubleDQNConfig:
     def __init__(self):
-        self.algo = "Double DQN"  # algorithm name
+        self.algo = "Double DQN"  # name of algo
         self.gamma = 0.99
         self.epsilon_start = 0.9  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01
@@ -40,7 +40,7 @@ class DoubleDQNConfig:
         self.lr = 0.01  # learning rate
         self.memory_capacity = 10000  # capacity of the replay memory
         self.batch_size = 128
-        self.train_eps = 250  # number of training episodes
+        self.train_eps = 300  # number of training episodes
         self.train_steps = 200  # max steps per training episode
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
@@ -84,9 +84,9 @@ if __name__ == "__main__":
     cfg = DoubleDQNConfig()
     env = gym.make('CartPole-v0').unwrapped  # google why unwrapped; it is usually not needed for gym here
     env.seed(1)  # set env random seed
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = DoubleDQN(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = DoubleDQN(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


@@ -0,0 +1,102 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18
LastEditor: John
LastEditTime: 2021-03-27 04:24:30
Discription:
Environment:
'''
import torch
import torch.nn as nn
import numpy as np
import random,math
from HierarchicalDQN.model import MLP
from common.memory import ReplayBuffer
import torch.optim as optim

class HierarchicalDQN:
    def __init__(self,state_dim,action_dim,cfg):
        self.state_dim = state_dim  # goals are (one-hot) states, so the meta controller acts over state_dim
        self.action_dim = action_dim
        self.gamma = cfg.gamma
        self.device = cfg.device
        self.batch_size = cfg.batch_size
        self.sample_count = 0
        self.epsilon = 0
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)    # low-level controller sees [state, goal]
        self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
        self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)  # meta controller picks a goal state
        self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
        self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
        self.memory = ReplayBuffer(cfg.memory_capacity)
        self.meta_memory = ReplayBuffer(cfg.memory_capacity)
    def to_onehot(self, x):
        '''one-hot encode a 0-based goal index over the state space'''
        oh = np.zeros(self.state_dim)
        oh[x] = 1.
        return oh
    def set_goal(self,meta_state):
        '''epsilon-greedy goal selection from the meta controller'''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        self.sample_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32)
                q_value = self.meta_policy_net(meta_state)
                goal = q_value.max(1)[1].item()
        else:
            goal = random.randrange(self.state_dim)
        onehot_goal = self.to_onehot(goal)
        return onehot_goal
    def choose_action(self,state):
        '''epsilon-greedy action selection from the low-level controller; state is [state, goal]'''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        self.sample_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.action_dim)
        return action
    def update(self):
        if len(self.memory) > self.batch_size:  # update the low-level controller
            state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
            state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
            action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
            reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
            next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
            done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
            q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
            next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
            expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0])
            loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
            self.optimizer.zero_grad()
            loss.backward()
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
        if len(self.meta_memory) > self.batch_size:  # update the meta controller
            meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.meta_memory.sample(self.batch_size)
            meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float)
            meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1)
            meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float)
            next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float)
            meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1)
            meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch)
            next_meta_state_values = self.meta_target_net(next_meta_state_batch).max(1)[0].detach()
            expected_meta_q_values = meta_reward_batch + self.gamma * next_meta_state_values * (1-meta_done_batch[0])
            meta_loss = nn.MSELoss()(meta_q_values, expected_meta_q_values.unsqueeze(1))
            self.meta_optimizer.zero_grad()
            meta_loss.backward()
            for param in self.meta_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.meta_optimizer.step()
    def save(self, path):
        '''minimal checkpoint saver so that task.py's agent.save(...) call works'''
        torch.save(self.policy_net.state_dict(), path+'hierarchical_dqn_checkpoint.pth')
        torch.save(self.meta_policy_net.state_dict(), path+'hierarchical_dqn_meta_checkpoint.pth')


@@ -0,0 +1,97 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:04
LastEditor: John
LastEditTime: 2021-03-27 04:23:43
Discription:
Environment:
'''
import sys,os
sys.path.append(os.getcwd())  # add current terminal path to sys.path
import gym
import numpy as np
import torch
import datetime
from HierarchicalDQN.agent import HierarchicalDQN
from common.plot import plot_rewards
from common.utils import save_results

SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'  # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
    os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'  # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
    os.mkdir(RESULT_PATH)

class HierarchicalDQNConfig:
    def __init__(self):
        self.algo = "Hierarchical DQN"  # name of algo
        self.gamma = 0.99
        self.epsilon_start = 0.95  # start epsilon of e-greedy policy
        self.epsilon_end = 0.01
        self.epsilon_decay = 200
        self.lr = 0.01  # learning rate
        self.memory_capacity = 800  # replay memory capacity
        self.batch_size = 64
        self.train_eps = 250  # number of training episodes
        self.train_steps = 200  # max steps per training episode
        self.target_update = 2  # update frequency of the target nets
        self.eval_eps = 20  # number of evaluation episodes
        self.eval_steps = 200  # max steps per evaluation episode
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect gpu
        self.hidden_dim = 256  # dimension of hidden layer

def train(cfg,env,agent):
    print('Start to train !')
    rewards = []
    ma_rewards = []  # moving average reward
    ep_steps = []
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        extrinsic_reward = 0
        for i_step in range(cfg.train_steps):
            goal = agent.set_goal(state)  # one-hot goal from the meta controller
            meta_state = state
            goal_state = np.concatenate([state, goal])
            action = agent.choose_action(goal_state)  # the low-level controller conditions on [state, goal]
            next_state, reward, done, _ = env.step(action)
            extrinsic_reward += reward
            intrinsic_reward = 1.0 if np.argmax(goal) == np.argmax(next_state) else 0.0  # reward for reaching the goal
            agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done)
            state = next_state
            agent.update()
            if done:
                break
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
            agent.meta_target_net.load_state_dict(agent.meta_policy_net.state_dict())
        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done))
        ep_steps.append(i_step)
        rewards.append(extrinsic_reward)
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*extrinsic_reward)
        else:
            ma_rewards.append(extrinsic_reward)
        agent.meta_memory.push(meta_state, np.argmax(goal), extrinsic_reward, state, done)  # store the meta transition with the goal index
    print('Complete training')
    return rewards,ma_rewards

if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()
    env = gym.make('CartPole-v0')
    env.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim,action_dim,cfg)
    rewards,ma_rewards = train(cfg,env,agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
    plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)


@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:17:09
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, state_dim,action_dim,hidden_dim=128):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


@@ -16,11 +16,11 @@ import torch
 class FisrtVisitMC:
     ''' On-Policy First-Visit MC Control
     '''
-    def __init__(self,n_actions,cfg):
-        self.n_actions = n_actions
+    def __init__(self,action_dim,cfg):
+        self.action_dim = action_dim
         self.epsilon = cfg.epsilon
         self.gamma = cfg.gamma
-        self.Q = defaultdict(lambda: np.zeros(n_actions))
+        self.Q = defaultdict(lambda: np.zeros(action_dim))
         self.returns_sum = defaultdict(float)  # sum of returns
         self.returns_count = defaultdict(float)
@@ -28,11 +28,11 @@ class FisrtVisitMC:
         ''' e-greedy policy '''
         if state in self.Q.keys():
             best_action = np.argmax(self.Q[state])
-            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
+            action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
             action_probs[best_action] += (1.0 - self.epsilon)
             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
         else:
-            action = np.random.randint(0,self.n_actions)
+            action = np.random.randint(0,self.action_dim)
         return action
     def update(self,one_ep_transition):
         # Find all (state, action) pairs we've visited in this one_ep_transition
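For reference, the quantities this update maintains follow standard first-visit MC control: for the first visit to $(s, a)$ in an episode, with return $G_t = \sum_{k=0}^{T-t-1} \gamma^{k} r_{t+k+1}$, the estimate is the running average

$$Q(s,a) = \frac{\text{returns\_sum}(s,a)}{\text{returns\_count}(s,a)},$$

where returns_sum accumulates $G_t$ and returns_count counts the first visits.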


@@ -79,8 +79,8 @@ def mc_train(cfg,env,agent):
 if __name__ == "__main__":
     mc_cfg = MCConfig()
     env = RacetrackEnv()
-    n_actions=9
-    agent = FisrtVisitMC(n_actions,mc_cfg)
+    action_dim=9
+    agent = FisrtVisitMC(action_dim,mc_cfg)
     rewards,ma_rewards= mc_train(mc_cfg,env,agent)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
     plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path=RESULT_PATH)


@@ -17,9 +17,9 @@ from PolicyGradient.model import MLP
 class PolicyGradient:
-    def __init__(self, n_states,cfg):
+    def __init__(self, state_dim,cfg):
         self.gamma = cfg.gamma
-        self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
+        self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
         self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
         self.batch_size = cfg.batch_size


@@ -80,9 +80,9 @@ if __name__ == "__main__":
     cfg = PGConfig()
     env = gym.make('CartPole-v0')  # google why unwrapped; it is usually not needed for gym here
     env.seed(1)  # set env random seed
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = PolicyGradient(n_states,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = PolicyGradient(state_dim,cfg)
     rewards, ma_rewards = train(cfg,env,agent)
     agent.save_model(SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


@@ -16,10 +16,10 @@ class MLP(nn.Module):
        Input: state dimension
        Output: probability
    '''
-    def __init__(self,n_states,hidden_dim = 36):
+    def __init__(self,state_dim,hidden_dim = 36):
         super(MLP, self).__init__()
-        # 24 and 36 are hidden layer sizes, which can be adjusted according to state_dim and n_actions
-        self.fc1 = nn.Linear(n_states, hidden_dim)
+        # 24 and 36 are hidden layer sizes, which can be adjusted according to state_dim and action_dim
+        self.fc1 = nn.Linear(state_dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim,hidden_dim)
         self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-03-12 16:48:25
+LastEditTime: 2021-03-26 16:51:01
 Discription:
 Environment:
 '''
@@ -16,39 +16,35 @@ from collections import defaultdict
 class QLearning(object):
     def __init__(self,
-                 n_actions,cfg):
-        self.n_actions = n_actions  # number of actions
+                 action_dim,cfg):
+        self.action_dim = action_dim  # dimension of action
         self.lr = cfg.lr  # learning rate
         self.gamma = cfg.gamma
         self.epsilon = 0
-        self.sample_count = 0  # epsilon decays with training, i.e. with the number of samples, so we keep a counter
+        self.sample_count = 0
         self.epsilon_start = cfg.epsilon_start
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
-        self.Q_table = defaultdict(lambda: np.zeros(n_actions))  # store the Q table as a dict; the 2-D array in the next line also works but needs extra code changes
-        # self.Q_table = np.zeros((n_states, n_actions))  # Q table
+        self.Q_table = defaultdict(lambda: np.zeros(action_dim))  # A nested dictionary that maps state -> (action -> action-value)
     def choose_action(self, state):
         self.sample_count += 1
         self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
             math.exp(-1. * self.sample_count / self.epsilon_decay)
-        # draw a random number in [0,1]; if it is larger than epsilon act greedily, otherwise explore randomly
+        # e-greedy policy
         if np.random.uniform(0, 1) > self.epsilon:
-            action = np.argmax(self.Q_table[state])
+            action = np.argmax(self.Q_table[str(state)])
         else:
-            action = np.random.choice(self.n_actions)  # with some probability explore by picking a random action
+            action = np.random.choice(self.action_dim)
         return action
     def update(self, state, action, reward, next_state, done):
-        Q_predict = self.Q_table[state][action]
+        Q_predict = self.Q_table[str(state)][action]
         if done:
             Q_target = reward  # terminal state
         else:
-            Q_target = reward + self.gamma * np.max(
-                self.Q_table[next_state])  # Q_table-learning
-        self.Q_table[state][action] += self.lr * (Q_target - Q_predict)
+            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
+        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
     def save(self,path):
-        '''Save the Q table data to a file
-        '''
         import dill
         torch.save(
             obj=self.Q_table,
@@ -56,7 +52,5 @@ class QLearning(object):
             pickle_module=dill
         )
     def load(self, path):
-        '''Load data from a file into the Q table
-        '''
         import dill
         self.Q_table =torch.load(f=path+'Qleaning_model.pkl',pickle_module=dill)
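The update method above is the textbook Q-learning rule

$$Q(s,a) \leftarrow Q(s,a) + \alpha\big[r + \gamma \max_{a'} Q(s',a') - Q(s,a)\big],$$

with $\alpha$ = self.lr and the bootstrap term dropped at terminal states.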


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-03-12 21:16:50
+LastEditTime: 2021-03-26 17:16:07
 Discription:
 Environment:
 '''
@@ -35,20 +35,18 @@ if not os.path.exists(RESULT_PATH):  # check that the folder exists
 class QlearningConfig:
     '''training-related parameters'''
     def __init__(self):
-        self.n_episodes = 200  # number of training episodes
+        self.train_eps = 200  # number of training episodes
         self.gamma = 0.9  # discount rate for rewards
         self.epsilon_start = 0.99  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
         self.epsilon_decay = 200  # decay rate of epsilon in the e-greedy policy
         self.lr = 0.1  # learning rate
 def train(cfg,env,agent):
-    # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
-    # env = FrozenLakeWapper(env)
-    rewards = []  # record the reward of every episode
-    ma_rewards = []  # moving-average reward
+    rewards = []
+    ma_rewards = []  # moving average reward
     steps = []  # record the steps of every episode
-    for i_episode in range(cfg.n_episodes):
+    for i_episode in range(cfg.train_eps):
         ep_reward = 0  # reward of the current episode
         ep_steps = 0  # number of steps in the current episode
         state = env.reset()  # reset the environment, i.e. start a new episode
@@ -63,12 +61,11 @@ def train(cfg,env,agent):
                 break
         steps.append(ep_steps)
         rewards.append(ep_reward)
-        # compute the moving-average reward
         if ma_rewards:
             ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward))
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward))
     return rewards,ma_rewards
 def eval(cfg,env,agent):
@@ -77,7 +74,7 @@ def eval(cfg,env,agent):
     rewards = []  # record the reward of every episode
     ma_rewards = []  # moving-average reward
     steps = []  # record the steps of every episode
-    for i_episode in range(cfg.n_episodes):
+    for i_episode in range(cfg.train_eps):
         ep_reward = 0  # reward of the current episode
         ep_steps = 0  # number of steps in the current episode
         state = env.reset()  # reset the environment, i.e. start a new episode
@@ -96,15 +93,15 @@ def eval(cfg,env,agent):
             ma_rewards.append(rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward))
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward))
     return rewards,ma_rewards
 if __name__ == "__main__":
     cfg = QlearningConfig()
     env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
     env = CliffWalkingWapper(env)
-    n_actions = env.action_space.n
-    agent = QLearning(n_actions,cfg)
+    action_dim = env.action_space.n
+    agent = QLearning(action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


codes/README_en.md (new file)

@@ -0,0 +1,57 @@
[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md)
## Introduction
This repo is used to learn basic RL algorithms; we keep the comments **detailed** and the structure **clear** as much as possible.
The code mainly consists of the following scripts:
* ```model.py``` basic network models for RL, such as MLP and CNN
* ```memory.py``` replay buffer
* ```plot.py``` uses seaborn to plot the reward curves, saved in the ```result``` folder
* ```env.py``` customizes or normalizes environments
* ```agent.py``` the core algorithm, a Python class with its main functions (choose_action, update)
* ```main.py``` main function
Note that ```model.py```, ```memory.py``` and ```plot.py``` are shared across algorithms, so they are placed in the ```common``` folder.
## Running Environment
python 3.7.9, pytorch 1.6.0, gym 0.18.0
## Usage
For environment information, see [environment notes](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)
## Schedule
| Name | Related materials | Used Envs | Notes |
| :----: | :----: | :----: | :----: |
| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | results are poor, to be improved |
| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | |
| A2C | | [CartPole-v0](./envs/gym_info.md) | |
| A3C | | | |
| SAC | | | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
| GAIL | | | |
## Refs
[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2)
[RL-Adventure](https://github.com/higgsfield/RL-Adventure)
https://www.cnblogs.com/lucifer1997/p/13458563.html


@@ -14,17 +14,17 @@ from collections import defaultdict
 import torch
 class Sarsa(object):
     def __init__(self,
-                 n_actions,sarsa_cfg,):
-        self.n_actions = n_actions  # number of actions
+                 action_dim,sarsa_cfg,):
+        self.action_dim = action_dim  # number of actions
         self.lr = sarsa_cfg.lr  # learning rate
         self.gamma = sarsa_cfg.gamma
         self.epsilon = sarsa_cfg.epsilon
-        self.Q = defaultdict(lambda: np.zeros(n_actions))
-        # self.Q = np.zeros((n_states, n_actions))  # Q table
+        self.Q = defaultdict(lambda: np.zeros(action_dim))
+        # self.Q = np.zeros((state_dim, action_dim))  # Q table
     def choose_action(self, state):
         best_action = np.argmax(self.Q[state])
         # action = best_action
-        action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
+        action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
         action_probs[best_action] += (1.0 - self.epsilon)
         action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
         return action
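The update step (not shown in this hunk) follows the standard on-policy Sarsa rule, bootstrapping from the action actually taken in the next state:

$$Q(s,a) \leftarrow Q(s,a) + \alpha\big[r + \gamma\, Q(s',a') - Q(s,a)\big].$$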


@@ -70,8 +70,8 @@ def sarsa_train(cfg,env,agent):
 if __name__ == "__main__":
     sarsa_cfg = SarsaConfig()
     env = RacetrackEnv()
-    n_actions=9
-    agent = Sarsa(n_actions,sarsa_cfg)
+    action_dim=9
+    agent = Sarsa(action_dim,sarsa_cfg)
     rewards,ma_rewards = sarsa_train(sarsa_cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 21:14:12
 LastEditor: John
-LastEditTime: 2021-03-23 16:35:46
+LastEditTime: 2021-03-24 22:15:00
 Discription:
 Environment:
 '''
@@ -14,16 +14,16 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributions import Categorical
-class MLP2(nn.Module):
-    def __init__(self, n_states,n_actions,hidden_dim=128):
+class MLP(nn.Module):
+    def __init__(self, state_dim,action_dim,hidden_dim=128):
         """ Initialize the Q network as a fully connected network
-            n_states: number of input features, i.e. the dimension of the environment state
-            n_actions: total number of output actions
+            state_dim: number of input features, i.e. the dimension of the environment state
+            action_dim: total number of output actions
         """
-        super(MLP2, self).__init__()
-        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(state_dim, hidden_dim)  # input layer
         self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # hidden layer
-        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+        self.fc3 = nn.Linear(hidden_dim, action_dim)  # output layer
     def forward(self, x):
         # activation functions of each layer
@@ -32,10 +32,10 @@ class MLP2(nn.Module):
         return self.fc3(x)
 class Critic(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Critic, self).__init__()
-        self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
+        self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
         self.linear3 = nn.Linear(hidden_size, 1)
         # randomly initialize with small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
         return x
 class Actor(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Actor, self).__init__()
         self.linear1 = nn.Linear(n_obs, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
-        self.linear3 = nn.Linear(hidden_size, n_actions)
+        self.linear3 = nn.Linear(hidden_size, action_dim)
         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
         return x
 class ActorCritic(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim=256):
+    def __init__(self, state_dim, action_dim, hidden_dim=256):
         super(ActorCritic, self).__init__()
         self.critic = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim, 1)
         )
         self.actor = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_dim, n_actions),
+            nn.Linear(hidden_dim, action_dim),
             nn.Softmax(dim=1),
         )
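For orientation, a quick usage sketch of the renamed ```MLP``` as a Q-network (the shapes below are illustrative placeholders, not values from this commit):

```python
import torch
from common.model import MLP

# a batch of 8 CartPole-like states (state_dim=4) and 2 discrete actions
q_net = MLP(state_dim=4, action_dim=2, hidden_dim=128)
q_values = q_net(torch.randn(8, 4))  # shape (8, 2): one Q value per action
greedy_actions = q_values.argmax(dim=1)
```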


@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
         self.natural = natural
         # Start the first game
         self._reset()        # Number of
-        self.n_actions = 2
+        self.action_dim = 2
     def reset(self):
         return self._reset()


@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         self.shape = (4, 12)
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
         # Cliff Location
         self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
-        super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
+        super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
     def render(self, mode='human', close=False):
         self._render(mode, close)


@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         self.shape = shape
         nS = np.prod(shape)
-        n_actions = 4
+        action_dim = 4
         MAX_Y = shape[0]
         MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
             y, x = it.multi_index
             # P[s][a] = (prob, next_state, reward, is_done)
-            P[s] = {a : [] for a in range(n_actions)}
+            P[s] = {a : [] for a in range(action_dim)}
             is_done = lambda s: s == 0 or s == (nS - 1)
             reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         # This should not be used in any model-free learning algorithm
         self.P = P
-        super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
     def _render(self, mode='human', close=False):
         """ Renders the current gridworld layout


@@ -0,0 +1,53 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:12:19
LastEditor: John
LastEditTime: 2021-03-26 17:12:43
Discription:
Environment:
'''
import numpy as np
import random

class StochasticMDP:
    def __init__(self):
        self.end = False
        self.curr_state = 2
        self.action_dim = 2
        self.state_dim = 6
        self.p_right = 0.5
    def reset(self):
        self.end = False
        self.curr_state = 2
        state = np.zeros(self.state_dim)
        state[self.curr_state - 1] = 1.
        return state
    def step(self, action):
        if self.curr_state != 1:
            if action == 1:
                if random.random() < self.p_right and self.curr_state < self.state_dim:
                    self.curr_state += 1
                else:
                    self.curr_state -= 1
            if action == 0:
                self.curr_state -= 1
            if self.curr_state == self.state_dim:
                self.end = True
        state = np.zeros(self.state_dim)
        state[self.curr_state - 1] = 1.
        if self.curr_state == 1:
            if self.end:
                return state, 1.00, True, {}
            else:
                return state, 1.00/100.00, True, {}
        else:
            return state, 0.0, False, {}
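A quick interaction sketch with this environment (random policy, assuming it runs in the same module, just to show the reset/step interface):

```python
import random

env = StochasticMDP()
state = env.reset()
done, episode_return = False, 0.0
while not done:
    action = random.randrange(env.action_dim)  # 0: move left, 1: move right with prob p_right
    state, reward, done, _ = env.step(action)
    episode_return += reward
print('episode return:', episode_return)
```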


@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         self.shape = (7, 10)
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
         # Wind strength
         winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
-        super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
     def render(self, mode='human', close=False):
         self._render(mode, close)


@@ -30,23 +30,7 @@
 | [Chapter 13: AlphaStar Paper Analysis](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | |
 ## Hands-on Algorithms
-| Algorithm | Reference | Environment | Notes |
-| :---: | :---: | :---: | :---: |
-| [On-Policy First-Visit MC](../codes/MonteCarlo) | | [Racetrack](../codes/envs/racetrack_env.md) | |
-| [Q-Learning](../codes/QLearning) | | [CliffWalking-v0](../codes/envs/gym_info.md) | |
-| [Sarsa](../codes/Sarsa) | | [Racetrack](../codes/envs/racetrack_env.md) | |
-| [DQN](../codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | |
-| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | Uses a CNN instead of a fully connected network |
-| [DoubleDQN](../codes/DoubleDQN) | | [CartPole-v0](../codes/envs/gym_info.md) | Results are poor, to be improved |
-| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
-| [PolicyGradient](../codes/PolicyGradient) | | [CartPole-v0](../codes/envs/gym_info.md) | |
-| A2C | | [CartPole-v0](../codes/envs/gym_info.md) | |
-| A3C | | | |
-| SAC | | | |
-| [PPO](../codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](../codes/envs/gym_info.md) | |
-| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](../codes/envs/gym_info.md) | |
-| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
-| GAIL | | | |
+[Click here](../codes) or jump to the ```codes``` folder for the hands-on algorithm implementations.
 ## Contributors