update

README.md (18 changes)
@@ -30,23 +30,7 @@
 | [Chapter 13: AlphaStar Paper Walkthrough](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | |
 ## Algorithm Implementations
 
-| Algorithm | Related papers | Environment | Notes |
-| :--------------------------------------: | :---------------------------------------------------------: | ------------------------------------- | :--------------------------------: |
-| [On-Policy First-Visit MC](./codes/MonteCarlo) | | [Racetrack](./codes/envs/racetrack_env.md) | |
-| [Q-Learning](./codes/QLearning) | | [CliffWalking-v0](./codes/envs/gym_info.md) | |
-| [Sarsa](./codes/Sarsa) | | [Racetrack](./codes/envs/racetrack_env.md) | |
-| [DQN](./codes/DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | |
-| DQN-cnn | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./codes/envs/gym_info.md) | Uses a CNN instead of a fully connected network |
-| [DoubleDQN](./codes/DoubleDQN) | | [CartPole-v0](./codes/envs/gym_info.md) | Results not good yet, to be improved |
-| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
-| [PolicyGradient](./codes/PolicyGradient) | | [CartPole-v0](./codes/envs/gym_info.md) | |
-| A2C | | [CartPole-v0](./codes/envs/gym_info.md) | |
-| A3C | | | |
-| SAC | | | |
-| [PPO](./codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./codes/envs/gym_info.md) | |
-| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./codes/envs/gym_info.md) | |
-| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
-| GAIL | | | |
+[Click here](./codes) or go to the ```codes``` folder for the algorithm implementations.
 
 ## Contributors
 
@@ -13,9 +13,9 @@ from A2C.model import ActorCritic
 import torch.optim as optim
 
 class A2C:
-    def __init__(self,n_states, n_actions, cfg):
+    def __init__(self,state_dim, action_dim, cfg):
         self.gamma = 0.99
-        self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device)
+        self.model = ActorCritic(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(cfg.device)
         self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr)
     def choose_action(self, state):
         dist, value = self.model(state)
@@ -95,8 +95,8 @@ if __name__ == "__main__":
     cfg = A2CConfig()
     env = gym.make('CartPole-v0')
     env.seed(1) # set random seed for env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = A2C(n_states, n_actions, cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = A2C(state_dim, action_dim, cfg)
     train(cfg,env,agent)
 
@@ -13,18 +13,18 @@ import torch.nn as nn
 from torch.distributions import Categorical
 
 class ActorCritic(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim=256):
+    def __init__(self, state_dim, action_dim, hidden_dim=256):
         super(ActorCritic, self).__init__()
         self.critic = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim, 1)
         )
 
         self.actor = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_dim, n_actions),
+            nn.Linear(hidden_dim, action_dim),
             nn.Softmax(dim=1),
         )
 
@@ -19,12 +19,12 @@ from common.memory import ReplayBuffer
 
 
 class DDPG:
-    def __init__(self, n_states, n_actions, cfg):
+    def __init__(self, state_dim, action_dim, cfg):
         self.device = cfg.device
-        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
 
         for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
             target_param.data.copy_(param.data)
@@ -41,17 +41,17 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.n_actions = action_space.shape[0]
+        self.action_dim = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
 
     def reset(self):
-        self.obs = np.ones(self.n_actions) * self.mu
+        self.obs = np.ones(self.action_dim) * self.mu
 
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
         self.obs = x + dx
         return self.obs
 
@@ -82,9 +82,9 @@ if __name__ == "__main__":
     cfg = DDPGConfig()
     env = NormalizedActions(gym.make("Pendulum-v0"))
     env.seed(1) # set a random seed for the env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
-    agent = DDPG(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
+    agent = DDPG(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
@@ -46,15 +46,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 
 class FCN(nn.Module):
-    def __init__(self, n_states=4, n_actions=18):
+    def __init__(self, state_dim=4, action_dim=18):
         """ Initialize the Q-network as a fully connected network
-        n_states: number of input features, i.e. the dimension of the environment state
-        n_actions: total number of output actions
+        state_dim: number of input features, i.e. the dimension of the environment state
+        action_dim: total number of output actions
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(n_states, 128)  # input layer
+        self.fc1 = nn.Linear(state_dim, 128)  # input layer
         self.fc2 = nn.Linear(128, 128)  # hidden layer
-        self.fc3 = nn.Linear(128, n_actions)  # output layer
+        self.fc3 = nn.Linear(128, action_dim)  # output layer
 
     def forward(self, x):
         # activation functions for each layer
@@ -66,8 +66,8 @@ class FCN(nn.Module):
 
 In ```agent.py``` we define the RL algorithm itself, including its two main functions ```choose_action``` and ```update```. In the initializer:
 ```python
-self.policy_net = FCN(n_states, n_actions).to(self.device)
-self.target_net = FCN(n_states, n_actions).to(self.device)
+self.policy_net = FCN(state_dim, action_dim).to(self.device)
+self.target_net = FCN(state_dim, action_dim).to(self.device)
 # the target_net starts as an exact copy of the policy_net parameters
 self.target_net.load_state_dict(self.policy_net.state_dict())
 self.target_net.eval()  # disable BatchNormalization and Dropout
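To go with the initializer above, here is a minimal sketch of the ```choose_action``` function that the text refers to. This is an illustrative epsilon-greedy version and an assumption on our part, not the exact code committed in ```agent.py```:

```python
import math
import random
import torch

def choose_action(self, state):
    # epsilon decays with the number of samples drawn so far
    self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        math.exp(-1. * self.sample_count / self.epsilon_decay)
    self.sample_count += 1
    if random.random() > self.epsilon:
        with torch.no_grad():
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            q_values = self.policy_net(state)
            action = q_values.max(1)[1].item()  # greedy action w.r.t. the policy net
    else:
        action = random.randrange(self.action_dim)  # random exploration
    return action
```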
@@ -20,11 +20,11 @@ import random
 import math
 import numpy as np
 from common.memory import ReplayBuffer
-from common.model import MLP2
+from common.model import MLP
 class DQN:
-    def __init__(self, n_states, n_actions, cfg):
+    def __init__(self, state_dim, action_dim, cfg):
 
-        self.n_actions = n_actions  # total number of actions
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device: cpu or gpu
         self.gamma = cfg.gamma  # discount factor for rewards
         # parameters of the e-greedy policy
@@ -34,8 +34,8 @@ class DQN:
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
         # the target_net starts as an exact copy of the policy_net parameters
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()  # disable BatchNormalization and Dropout
@@ -64,7 +64,7 @@ class DQN:
                 # tensor.max(1)[1] returns the index of the maximum value, i.e. the action
                 action = q_value.max(1)[1].item()
             else:
-                action = random.randrange(self.n_actions)
+                action = random.randrange(self.action_dim)
             return action
         else:
             with torch.no_grad():  # gradients are not tracked here
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2021-03-17 20:35:37
+LastEditTime: 2021-03-26 17:17:17
 @Discription: 
 @Environment: python 3.7.7
 '''
@@ -40,7 +40,7 @@ class DQNConfig:
         self.lr = 0.01  # learning rate
         self.memory_capacity = 800  # capacity of the replay memory
         self.batch_size = 64
-        self.train_eps = 250  # number of training episodes
+        self.train_eps = 300  # number of training episodes
         self.train_steps = 200  # max steps per training episode
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
@@ -84,9 +84,9 @@ if __name__ == "__main__":
     cfg = DQNConfig()
     env = gym.make('CartPole-v0').unwrapped  # you can google why gym envs are unwrapped; it is usually not needed here
     env.seed(1)  # set a random seed for the env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = DQN(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = DQN(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
BIN  codes/DQN/results/20210326-171704/ma_rewards_train.npy (new file)
BIN  codes/DQN/results/20210326-171704/rewards_curve_train.png (new file)
BIN  codes/DQN/results/20210326-171704/rewards_train.npy (new file)
BIN  codes/DQN/results/20210326-171722/ma_rewards_train.npy (new file)
BIN  codes/DQN/results/20210326-171722/rewards_curve_train.png (new file)
BIN  codes/DQN/results/20210326-171722/rewards_train.npy (new file)
BIN  codes/DQN/saved_model/20210326-171704/dqn_checkpoint.pth (new file)
BIN  codes/DQN/saved_model/20210326-171722/dqn_checkpoint.pth (new file)
codes/DQN_cnn/README.md (new file, +2 lines)
@@ -0,0 +1,2 @@
# DQN with CNN

The principle is the same as [DQN](../DQN); the only difference is that the neural network is replaced by a convolutional neural network, which is meant for two-dimensional observations (state or observation).
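As an illustration of that idea, a convolutional Q-network for image-like observations could look like the sketch below. This is an assumption for illustration, modeled on the architecture from the DQN paper; it is not the code committed in this diff:

```python
import torch.nn as nn

class CNNQNet(nn.Module):
    """Convolutional Q-network for stacked image observations, e.g. 4 frames of 84x84 pixels."""
    def __init__(self, in_channels=4, action_dim=18):
        super(CNNQNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),  # 7x7 feature map for 84x84 inputs
            nn.Linear(512, action_dim),
        )

    def forward(self, x):
        # x: (batch, in_channels, 84, 84) -> one Q-value per action
        x = self.conv(x)
        return self.fc(x.view(x.size(0), -1))
```

The rest of the training loop (replay buffer, target network, epsilon-greedy policy) stays exactly as in DQN.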
codes/DoubleDQN/README.md (new file, +39 lines)
@@ -0,0 +1,39 @@
Before reading this, you should already have a basic understanding of DQN; see the [DQN implementation](../DQN).

## Principle

Double-DQN was proposed in 2016 and was inspired by the 2010 Double Q-learning algorithm; see the paper [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461).
Like Nature DQN, Double-DQN also uses two networks: a current network (denoted $Q$) and a target network (usually denoted $Q'$; to keep the two apart, we write it as $Q_{tar}$ below). Recall first that, for non-terminal states, the target $Q_{tar}$ value is computed as follows:

*(formula image)*

In Double-DQN we no longer take the maximum $Q_{tar}$ value over actions directly from the target network; instead, we first use the current network $Q$ to select the action with the largest $Q$ value, and then plug that action into the target network to compute the corresponding value:

*(formula image)*
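The two formula images above are not reproduced in this diff; written out in standard notation as a reconstruction (with $r$ the reward, $\gamma$ the discount factor, $s'$ the next state, $w$ the current-network parameters and $w^-$ the target-network parameters), the two targets being contrasted are:

```latex
% Nature DQN target: action selection and evaluation both use the target network
y^{\text{DQN}} = r + \gamma \max_{a'} Q_{tar}(s', a'; w^-)

% Double DQN target: the current network selects the action, the target network evaluates it
y^{\text{DoubleDQN}} = r + \gamma \, Q_{tar}\big(s', \arg\max_{a'} Q(s', a'; w); w^-\big)
```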
The benefit: while the max operator in Nature DQN lets the Q values move quickly toward a plausible optimization target, it easily overshoots and causes over-estimation, i.e. the learned model ends up with a large bias. To solve this, DDQN decouples the selection of the action used in the target Q value from the computation of that target Q value, which removes the over-estimation; see the original paper if you are interested.

The pseudocode is as follows:

*(pseudocode figure)*

The two networks can also each play both roles, acting as current and target network for each other, as follows:

*(diagram)*

Or, shown this way, it may be easier to see how both roles are served at once:

*(diagram)*

## Implementation

The full program is on [github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN). Given the principle above, the Double DQN change is actually very simple; basically only a few lines inside ```update``` need to be modified, as follows:

```python
'''This is how Nature DQN computes q_target
next_q_state_value = self.target_net(
    next_state_batch).max(1)[0].detach()  # max Q'(s_{t+1}) over all next states, where Q' is the target network's Q function, e.g. tensor([ 0.0060, -0.0171,...,])
# compute q_target
# for terminal states done_batch[0]=1, so the corresponding expected_q_value equals the reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''This is how Double DQN computes q_target; it differs only slightly from Nature DQN'''
next_target_values = self.target_net(
    next_state_batch)
# select the action that maximizes Q(s_t', a) and feed it into next_target_values to get the target net's next_q_value, i.e. Q'(s_t'|a=argmax Q(s_t', a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
```

The resulting reward curves are as follows:

*(reward curve figure)*

The lower blue and red curves show the training rewards of Double DQN and Nature DQN respectively, while the upper blue and green curves show the evaluation rewards of Double DQN and Nature DQN.
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-03-13 15:01:27
+LastEditTime: 2021-03-28 11:07:35
 @Discription: 
 @Environment: python 3.7.7
 '''
@@ -16,16 +16,15 @@ LastEditTime: 2021-03-13 15:01:27
 import torch
 import torch.nn as nn
 import torch.optim as optim
-import torch.nn.functional as F
 import random
 import math
 import numpy as np
 from common.memory import ReplayBuffer
-from common.model import MLP2
+from common.model import MLP
 class DoubleDQN:
-    def __init__(self, n_states, n_actions, cfg):
+    def __init__(self, state_dim, action_dim, cfg):
 
-        self.n_actions = n_actions  # total number of actions
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device: cpu or gpu
         self.gamma = cfg.gamma
         # parameters of the e-greedy policy
@@ -34,8 +33,8 @@ class DoubleDQN:
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
         # the target_net starts as an exact copy of the policy_net parameters
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()  # disable BatchNormalization and Dropout
@@ -63,7 +62,7 @@ class DoubleDQN:
                 # tensor.max(1)[1] returns the index of the maximum value, i.e. the action
                 action = q_value.max(1)[1].item()
         else:
-            action = random.randrange(self.n_actions)
+            action = random.randrange(self.action_dim)
         return action
     def update(self):
 
BIN  codes/DoubleDQN/assets/20201222145725907.png (new file)
BIN  codes/DoubleDQN/assets/20201222150225327.png (new file)
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2021-03-17 20:11:19
+LastEditTime: 2021-03-28 11:05:14
 @Discription: 
 @Environment: python 3.7.7
 '''
@@ -32,7 +32,7 @@ if not os.path.exists(RESULT_PATH):
 
 class DoubleDQNConfig:
     def __init__(self):
-        self.algo = "Double DQN"  # 算法名称
+        self.algo = "Double DQN"  # name of algo
         self.gamma = 0.99
         self.epsilon_start = 0.9  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01
@@ -40,7 +40,7 @@ class DoubleDQNConfig:
         self.lr = 0.01  # learning rate
         self.memory_capacity = 10000  # capacity of the replay memory
         self.batch_size = 128
-        self.train_eps = 250  # number of training episodes
+        self.train_eps = 300  # number of training episodes
         self.train_steps = 200  # max steps per training episode
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
@@ -84,9 +84,9 @@ if __name__ == "__main__":
     cfg = DoubleDQNConfig()
     env = gym.make('CartPole-v0').unwrapped  # you can google why gym envs are unwrapped; it is usually not needed here
     env.seed(1)  # set a random seed for the env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = DoubleDQN(n_states,n_actions,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = DoubleDQN(state_dim,action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
BIN  codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy (new file)
BIN  codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png (new file)
BIN  codes/DoubleDQN/results/20210328-110516/rewards_train.npy (new file)

codes/HierarchicalDQN/agent.py (new file, +102 lines)
@@ -0,0 +1,102 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18
LastEditor: John
LastEditTime: 2021-03-27 04:24:30
Discription: 
Environment: 
'''
import torch
import torch.nn as nn
import numpy as np
import random,math
from HierarchicalDQN.model import MLP
from common.memory import ReplayBuffer
import torch.optim as optim

class HierarchicalDQN:
    def __init__(self,state_dim,action_dim,cfg):
        self.action_dim = action_dim
        self.device = cfg.device
        self.gamma = cfg.gamma  # reward discount factor, used in update
        self.batch_size = cfg.batch_size
        self.sample_count = 0
        self.epsilon = 0
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.batch_size = cfg.batch_size
        self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
        self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
        self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
        self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
        self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
        self.memory = ReplayBuffer(cfg.memory_capacity)
        self.meta_memory = ReplayBuffer(cfg.memory_capacity)
    def to_onehot(self, x):
        oh = np.zeros(6)
        oh[x - 1] = 1.
        return oh
    def set_goal(self,meta_state):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        self.sample_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32)
                q_value = self.policy_net(meta_state)
                goal = q_value.max(1)[1].item()
        else:
            goal = random.randrange(self.action_dim)
            goal = self.meta_policy_net(meta_state)
        onehot_goal = self.to_onehot(goal)
        return onehot_goal
    def choose_action(self,state):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        self.sample_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.action_dim)
        return action
    def update(self):
        if self.batch_size > len(self.memory):
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
        state_batch = torch.tensor(
            state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
        expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0])
        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.batch_size > len(self.meta_memory):
            return
        meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size)
        meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float)
        meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1)
        meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float)
        next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float)
        meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1)
        meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch)
        next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach()
        expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0])
        meta_loss = nn.MSELoss()(meta_q_values, expected_meta_q_values.unsqueeze(1))
        self.meta_optimizer.zero_grad()
        meta_loss.backward()
        for param in self.meta_policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.meta_optimizer.step()
codes/HierarchicalDQN/main.py (new file, +97 lines)
@@ -0,0 +1,97 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:04
LastEditor: John
LastEditTime: 2021-03-27 04:23:43
Discription: 
Environment: 
'''
import sys,os
sys.path.append(os.getcwd())  # add current terminal path to sys.path
import gym
import numpy as np
import torch
import datetime
from HierarchicalDQN.agent import HierarchicalDQN
from common.plot import plot_rewards
from common.utils import save_results

SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'  # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
    os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'  # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
    os.mkdir(RESULT_PATH)

class HierarchicalDQNConfig:
    def __init__(self):
        self.algo = "DQN"  # name of algo
        self.gamma = 0.99
        self.epsilon_start = 0.95  # start epsilon of e-greedy policy
        self.epsilon_end = 0.01
        self.epsilon_decay = 200
        self.lr = 0.01  # learning rate
        self.memory_capacity = 800  # Replay Memory capacity
        self.batch_size = 64
        self.train_eps = 250  # number of training episodes
        self.train_steps = 200  # max steps per training episode
        self.target_update = 2  # update frequency of the target net
        self.eval_eps = 20  # number of evaluation episodes
        self.eval_steps = 200  # max steps per evaluation episode
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # check gpu
        self.hidden_dim = 256  # dimension of hidden layer

def train(cfg,env,agent):
    print('Start to train !')
    rewards = []
    ma_rewards = []  # moving average reward
    ep_steps = []
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        extrinsic_reward = 0
        for i_step in range(cfg.train_steps):
            goal= agent.set_goal(state)
            meta_state = state
            goal_state = np.concatenate([state, goal])
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            extrinsic_reward += reward
            intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
            agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done)
            state = next_state
            agent.update()
            if done:
                break
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done))
        ep_steps.append(i_step)
        rewards.append(extrinsic_reward)
        if ma_rewards:
            ma_rewards.append(
                0.9*ma_rewards[-1]+0.1*extrinsic_reward)
        else:
            ma_rewards.append(extrinsic_reward)
        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
    print('Complete training!')
    return rewards,ma_rewards

if __name__ == "__main__":
    cfg = HierarchicalDQNConfig()
    env = gym.make('CartPole-v0')
    env.seed(1)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = HierarchicalDQN(state_dim,action_dim,cfg)
    rewards,ma_rewards = train(cfg,env,agent)
    agent.save(path=SAVED_MODEL_PATH)
    save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
    plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)
codes/HierarchicalDQN/model.py (new file, +24 lines)
@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:17:09
Discription: 
Environment: 
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
    def __init__(self, state_dim,action_dim,hidden_dim=128):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
@@ -16,11 +16,11 @@ import torch
 class FisrtVisitMC:
     ''' On-Policy First-Visit MC Control
     '''
-    def __init__(self,n_actions,cfg):
-        self.n_actions = n_actions
+    def __init__(self,action_dim,cfg):
+        self.action_dim = action_dim
         self.epsilon = cfg.epsilon
         self.gamma = cfg.gamma
-        self.Q = defaultdict(lambda: np.zeros(n_actions))
+        self.Q = defaultdict(lambda: np.zeros(action_dim))
         self.returns_sum = defaultdict(float)  # sum of returns
         self.returns_count = defaultdict(float)
 
@@ -28,11 +28,11 @@ class FisrtVisitMC:
         ''' e-greedy policy '''
         if state in self.Q.keys():
             best_action = np.argmax(self.Q[state])
-            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
+            action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
             action_probs[best_action] += (1.0 - self.epsilon)
             action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
         else:
-            action = np.random.randint(0,self.n_actions)
+            action = np.random.randint(0,self.action_dim)
         return action
     def update(self,one_ep_transition):
         # Find all (state, action) pairs we've visited in this one_ep_transition
@@ -79,8 +79,8 @@ def mc_train(cfg,env,agent):
 if __name__ == "__main__":
     mc_cfg = MCConfig()
     env = RacetrackEnv()
-    n_actions=9
-    agent = FisrtVisitMC(n_actions,mc_cfg)
+    action_dim=9
+    agent = FisrtVisitMC(action_dim,mc_cfg)
     rewards,ma_rewards= mc_train(mc_cfg,env,agent)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
     plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path=RESULT_PATH)
@@ -17,9 +17,9 @@ from PolicyGradient.model import MLP
 
 class PolicyGradient:
 
-    def __init__(self, n_states,cfg):
+    def __init__(self, state_dim,cfg):
         self.gamma = cfg.gamma
-        self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
+        self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
         self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
         self.batch_size = cfg.batch_size
 
@@ -80,9 +80,9 @@ if __name__ == "__main__":
     cfg = PGConfig()
     env = gym.make('CartPole-v0')  # you can google why gym envs are unwrapped; it is usually not needed here
     env.seed(1)  # set a random seed for the env
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.n
-    agent = PolicyGradient(n_states,cfg)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = PolicyGradient(state_dim,cfg)
     rewards, ma_rewards = train(cfg,env,agent)
     agent.save_model(SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
@@ -16,10 +16,10 @@ class MLP(nn.Module):
     Input: the state dimension
     Output: a probability
     '''
-    def __init__(self,n_states,hidden_dim = 36):
+    def __init__(self,state_dim,hidden_dim = 36):
         super(MLP, self).__init__()
-        # 24 and 36 are the hidden layer sizes; change them according to state_dim and n_actions
-        self.fc1 = nn.Linear(n_states, hidden_dim)
+        # 24 and 36 are the hidden layer sizes; change them according to state_dim and action_dim
+        self.fc1 = nn.Linear(state_dim, hidden_dim)
         self.fc2 = nn.Linear(hidden_dim,hidden_dim)
         self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
 
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-03-12 16:48:25
+LastEditTime: 2021-03-26 16:51:01
 Discription: 
 Environment: 
 '''
@@ -16,39 +16,35 @@ from collections import defaultdict
 
 class QLearning(object):
     def __init__(self,
-                 n_actions,cfg):
-        self.n_actions = n_actions  # number of actions
+                 action_dim,cfg):
+        self.action_dim = action_dim  # dimension of action
         self.lr = cfg.lr  # learning rate
         self.gamma = cfg.gamma
         self.epsilon = 0
-        self.sample_count = 0  # epsilon decays with the number of samples drawn during training, so we keep a count
+        self.sample_count = 0
         self.epsilon_start = cfg.epsilon_start
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
-        self.Q_table = defaultdict(lambda: np.zeros(n_actions))  # store the Q table in a dict; the 2D array in the next line would also work but needs extra code changes
-        # self.Q_table = np.zeros((n_states, n_actions))  # Q table
+        self.Q_table = defaultdict(lambda: np.zeros(action_dim))  # A nested dictionary that maps state -> (action -> action-value)
     def choose_action(self, state):
         self.sample_count += 1
         self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
             math.exp(-1. * self.sample_count / self.epsilon_decay)
-        # draw a random value in [0,1]; if it is larger than epsilon choose the action greedily, otherwise choose at random
+        # e-greedy policy
         if np.random.uniform(0, 1) > self.epsilon:
-            action = np.argmax(self.Q_table[state])
+            action = np.argmax(self.Q_table[str(state)])
         else:
-            action = np.random.choice(self.n_actions)  # with some probability explore by picking a random action
+            action = np.random.choice(self.action_dim)
         return action
 
     def update(self, state, action, reward, next_state, done):
-        Q_predict = self.Q_table[state][action]
+        Q_predict = self.Q_table[str(state)][action]
         if done:
             Q_target = reward  # terminal state
         else:
-            Q_target = reward + self.gamma * np.max(
-                self.Q_table[next_state])  # Q_table-learning
-        self.Q_table[state][action] += self.lr * (Q_target - Q_predict)
+            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
+        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
     def save(self,path):
-        '''save the Q table data to a file
-        '''
         import dill
         torch.save(
             obj=self.Q_table,
@@ -56,7 +52,5 @@ class QLearning(object):
             pickle_module=dill
         )
     def load(self, path):
-        '''load the data from a file into the Q table
-        '''
         import dill
         self.Q_table =torch.load(f=path+'Qleaning_model.pkl',pickle_module=dill)
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-03-12 21:16:50
+LastEditTime: 2021-03-26 17:16:07
 Discription: 
 Environment: 
 '''
@@ -35,20 +35,18 @@ if not os.path.exists(RESULT_PATH): # check whether the folder exists
 class QlearningConfig:
     '''training-related parameters'''
     def __init__(self):
-        self.n_episodes = 200  # number of training episodes
+        self.train_eps = 200  # number of training episodes
         self.gamma = 0.9  # reward discount rate
         self.epsilon_start = 0.99  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
         self.epsilon_decay = 200  # decay rate of epsilon in the e-greedy policy
-        self.lr = 0.1  # 学习率
+        self.lr = 0.1  # learning rate
 
 def train(cfg,env,agent):
-    # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
-    # env = FrozenLakeWapper(env)
-    rewards = []  # record the reward of every episode
-    ma_rewards = []  # moving-average reward
+    rewards = []
+    ma_rewards = []  # moving average reward
     steps = []  # record the steps of every episode
-    for i_episode in range(cfg.n_episodes):
+    for i_episode in range(cfg.train_eps):
         ep_reward = 0  # record the reward of this episode
         ep_steps = 0  # record how many steps this episode took
         state = env.reset()  # reset the environment, i.e. start a new episode
@@ -63,12 +61,11 @@ def train(cfg,env,agent):
                 break
         steps.append(ep_steps)
         rewards.append(ep_reward)
-        # compute the moving-average reward
         if ma_rewards:
             ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward))
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward))
     return rewards,ma_rewards
 
 def eval(cfg,env,agent):
@@ -77,7 +74,7 @@ def eval(cfg,env,agent):
     rewards = []  # record the reward of every episode
     ma_rewards = []  # moving-average reward
     steps = []  # record the steps of every episode
-    for i_episode in range(cfg.n_episodes):
+    for i_episode in range(cfg.train_eps):
         ep_reward = 0  # record the reward of this episode
         ep_steps = 0  # record how many steps this episode took
         state = env.reset()  # reset the environment, i.e. start a new episode
@@ -96,15 +93,15 @@ def eval(cfg,env,agent):
             ma_rewards.append(rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes,ep_reward))
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.train_eps,ep_reward))
     return rewards,ma_rewards
 
 if __name__ == "__main__":
     cfg = QlearningConfig()
     env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
     env = CliffWalkingWapper(env)
-    n_actions = env.action_space.n
-    agent = QLearning(n_actions,cfg)
+    action_dim = env.action_space.n
+    agent = QLearning(action_dim,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
BIN  codes/QLearning/results/20210326-171621/ma_rewards_train.npy (new file)
BIN  codes/QLearning/results/20210326-171621/rewards_curve_train.png (new file)
BIN  codes/QLearning/results/20210326-171621/rewards_train.npy (new file)
BIN  codes/QLearning/saved_model/20210326-171621/Qleaning_model.pkl (new file)

codes/README_en.md (new file, +57 lines)
@@ -0,0 +1,57 @@

[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md)

## Introduction

This repo is for learning basic RL algorithms; we try to keep the code as **well commented** and **clearly structured** as possible.

The code structure mainly contains the following scripts:

* ```model.py``` basic network models for RL, such as MLP and CNN
* ```memory.py``` replay buffer (see the sketch after the note below)
* ```plot.py``` uses seaborn to plot reward curves, saved in the ```result``` folder
* ```env.py``` to customize or normalize environments
* ```agent.py``` core algorithm, a Python class with its main functions (choose action, update)
* ```main.py``` main function

Note that ```model.py```, ```memory.py``` and ```plot.py``` are shared by different algorithms, so they are put into the ```common``` folder.
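For illustration, ```memory.py``` can be as small as the following replay buffer sketch; the committed ```common/memory.py``` may differ in details:

```python
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of stored transitions
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        # store a transition, overwriting the oldest one once the buffer is full
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # uniformly sample a batch and regroup it field by field
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)
```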

## Running Environment

python 3.7.9, pytorch 1.6.0, gym 0.18.0

## Usage

For environment information see the [environment notes](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)

## Schedule

| Name | Related materials | Used Envs | Notes |
| :----------------------------------------------------------: | :---------------------------------------------------------: | ------------------------------------------------------------ | :----------------------------------------------------------: |
| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | not working well yet |
| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | |
| A2C | | [CartPole-v0](./envs/gym_info.md) | |
| A3C | | | |
| SAC | | | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
| GAIL | | | |

## Refs

[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2)

[RL-Adventure](https://github.com/higgsfield/RL-Adventure)

https://www.cnblogs.com/lucifer1997/p/13458563.html
@@ -14,17 +14,17 @@ from collections import defaultdict
 import torch
 class Sarsa(object):
     def __init__(self,
-                 n_actions,sarsa_cfg,):
-        self.n_actions = n_actions  # number of actions
+                 action_dim,sarsa_cfg,):
+        self.action_dim = action_dim  # number of actions
         self.lr = sarsa_cfg.lr  # learning rate
         self.gamma = sarsa_cfg.gamma
         self.epsilon = sarsa_cfg.epsilon
-        self.Q = defaultdict(lambda: np.zeros(n_actions))
-        # self.Q = np.zeros((n_states, n_actions))  # Q table
+        self.Q = defaultdict(lambda: np.zeros(action_dim))
+        # self.Q = np.zeros((state_dim, action_dim))  # Q table
     def choose_action(self, state):
         best_action = np.argmax(self.Q[state])
         # action = best_action
-        action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
+        action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
         action_probs[best_action] += (1.0 - self.epsilon)
         action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
         return action
@@ -70,8 +70,8 @@ def sarsa_train(cfg,env,agent):
 if __name__ == "__main__":
     sarsa_cfg = SarsaConfig()
     env = RacetrackEnv()
-    n_actions=9
-    agent = Sarsa(n_actions,sarsa_cfg)
+    action_dim=9
+    agent = Sarsa(action_dim,sarsa_cfg)
     rewards,ma_rewards = sarsa_train(sarsa_cfg,env,agent)
     agent.save(path=SAVED_MODEL_PATH)
     save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 21:14:12
 LastEditor: John
-LastEditTime: 2021-03-23 16:35:46
+LastEditTime: 2021-03-24 22:15:00
 Discription: 
 Environment: 
 '''
@@ -14,16 +14,16 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributions import Categorical
 
-class MLP2(nn.Module):
-    def __init__(self, n_states,n_actions,hidden_dim=128):
+class MLP(nn.Module):
+    def __init__(self, state_dim,action_dim,hidden_dim=128):
         """ Initialize the Q-network as a fully connected network
-        n_states: number of input features, i.e. the dimension of the environment state
-        n_actions: total number of output actions
+        state_dim: number of input features, i.e. the dimension of the environment state
+        action_dim: total number of output actions
         """
-        super(MLP2, self).__init__()
-        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(state_dim, hidden_dim)  # input layer
         self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # hidden layer
-        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+        self.fc3 = nn.Linear(hidden_dim, action_dim)  # output layer
 
     def forward(self, x):
         # activation functions for each layer
@@ -32,10 +32,10 @@ class MLP2(nn.Module):
         return self.fc3(x)
 
 class Critic(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Critic, self).__init__()
 
-        self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
+        self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
         self.linear3 = nn.Linear(hidden_size, 1)
         # randomly initialize to small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
         return x
 
 class Actor(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Actor, self).__init__()
         self.linear1 = nn.Linear(n_obs, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
-        self.linear3 = nn.Linear(hidden_size, n_actions)
+        self.linear3 = nn.Linear(hidden_size, action_dim)
 
         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
|
|||||||
return x
|
return x
|
||||||
|
|
||||||
class ActorCritic(nn.Module):
|
class ActorCritic(nn.Module):
|
||||||
def __init__(self, n_states, n_actions, hidden_dim=256):
|
def __init__(self, state_dim, action_dim, hidden_dim=256):
|
||||||
super(ActorCritic, self).__init__()
|
super(ActorCritic, self).__init__()
|
||||||
self.critic = nn.Sequential(
|
self.critic = nn.Sequential(
|
||||||
nn.Linear(n_states, hidden_dim),
|
nn.Linear(state_dim, hidden_dim),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
nn.Linear(hidden_dim, 1)
|
nn.Linear(hidden_dim, 1)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.actor = nn.Sequential(
|
self.actor = nn.Sequential(
|
||||||
nn.Linear(n_states, hidden_dim),
|
nn.Linear(state_dim, hidden_dim),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
nn.Linear(hidden_dim, n_actions),
|
nn.Linear(hidden_dim, action_dim),
|
||||||
nn.Softmax(dim=1),
|
nn.Softmax(dim=1),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
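The hunks above only rename `MLP2` to `MLP` and `n_states`/`n_actions` to `state_dim`/`action_dim`; behaviour is unchanged. For reference, a minimal sketch of exercising the renamed `MLP` Q-network on a CartPole-sized input — the module path, dimensions, and dummy state below are assumptions for illustration, not part of this diff:

```python
import torch

from model import MLP  # hypothetical import path; the diff does not show the module name

state_dim, action_dim = 4, 2           # e.g. CartPole-v0 sizes (assumed for this sketch)
q_net = MLP(state_dim, action_dim, hidden_dim=128)

state = torch.randn(1, state_dim)      # dummy batched state, just to check shapes
q_values = q_net(state)                # shape: (1, action_dim)
action = q_values.argmax(dim=1).item() # greedy action index
print(q_values.shape, action)
```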
@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
         self.natural = natural
         # Start the first game
         self._reset()        # Number of 
-        self.n_actions = 2
+        self.action_dim = 2
 
     def reset(self):
         return self._reset()
@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         self.shape = (4, 12)
 
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
 
         # Cliff Location
         self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
 
-        super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
+        super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
 
     def render(self, mode='human', close=False):
         self._render(mode, close)
@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         self.shape = shape
 
         nS = np.prod(shape)
-        n_actions = 4
+        action_dim = 4
 
         MAX_Y = shape[0]
         MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
             y, x = it.multi_index
 
             # P[s][a] = (prob, next_state, reward, is_done)
-            P[s] = {a : [] for a in range(n_actions)}
+            P[s] = {a : [] for a in range(action_dim)}
 
             is_done = lambda s: s == 0 or s == (nS - 1)
             reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         # This should not be used in any model-free learning algorithm
         self.P = P
 
-        super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
 
     def _render(self, mode='human', close=False):
         """ Renders the current gridworld layout
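These gridworld-style environments are `discrete.DiscreteEnv` subclasses, so each one builds the tabular model `P[s][a] = [(prob, next_state, reward, done), ...]` noted in the comment above, and only planning-style code should read it. As a hedged illustration (not code from this repo), one synchronous sweep of value iteration over such a table could look like:

```python
import numpy as np

def value_iteration_sweep(P, nS, nA, V, gamma=0.99):
    """One synchronous sweep of value iteration over a tabular model P[s][a]."""
    new_V = np.zeros(nS)
    for s in range(nS):
        q = np.zeros(nA)
        for a in range(nA):
            for prob, next_state, reward, done in P[s][a]:
                # expected return of taking action a in state s under the current V
                q[a] += prob * (reward + gamma * V[next_state] * (not done))
        new_V[s] = q.max()
    return new_V
```

Starting from `V = np.zeros(nS)` and repeating the sweep until the values stop changing would recover the optimal state values for these small deterministic gridworlds.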
codes/envs/stochastic_mdp.py (new file, 53 lines added)
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2021-03-24 22:12:19
+LastEditor: John
+LastEditTime: 2021-03-26 17:12:43
+Discription: 
+Environment: 
+'''
+import numpy as np
+import random
+
+
+class StochasticMDP:
+    def __init__(self):
+        self.end = False
+        self.curr_state = 2
+        self.action_dim = 2
+        self.state_dim = 6
+        self.p_right = 0.5
+
+    def reset(self):
+        self.end = False
+        self.curr_state = 2
+        state = np.zeros(self.state_dim)
+        state[self.curr_state - 1] = 1.
+        return state
+
+    def step(self, action):
+        if self.curr_state != 1:
+            if action == 1:
+                if random.random() < self.p_right and self.curr_state < self.state_dim:
+                    self.curr_state += 1
+                else:
+                    self.curr_state -= 1
+
+            if action == 0:
+                self.curr_state -= 1
+            if self.curr_state == self.state_dim:
+                self.end = True
+
+        state = np.zeros(self.state_dim)
+        state[self.curr_state - 1] = 1.
+
+        if self.curr_state == 1:
+            if self.end:
+                return state, 1.00, True, {}
+            else:
+                return state, 1.00/100.00, True, {}
+        else:
+            return state, 0.0, False, {}
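To sanity-check the new environment, a random-policy rollout of `StochasticMDP` (the six-state chain commonly used with Hierarchical DQN) might look like the sketch below; the import path is inferred from the file location above and is an assumption:

```python
import random

from codes.envs.stochastic_mdp import StochasticMDP  # path assumed from the diff

env = StochasticMDP()
state = env.reset()
total_reward, done = 0.0, False
while not done:
    action = random.randrange(env.action_dim)  # 0 = always move left, 1 = move right with prob 0.5
    state, reward, done, _ = env.step(action)
    total_reward += reward
# Returns 1.0 if state 6 was visited before coming back to state 1, otherwise 0.01.
print("episode return:", total_reward)
```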
@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         self.shape = (7, 10)
 
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
 
         # Wind strength
         winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
 
-        super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
 
     def render(self, mode='human', close=False):
         self._render(mode, close)
@@ -30,23 +30,7 @@
 | [Chapter 13: AlphaStar Paper Explained](https://datawhalechina.github.io/easy-rl/#/chapter13/chapter13) | | |
 ## Hands-on Algorithms
 
-| Algorithm | Related Papers | Environment | Notes |
-| :---: | :---: | :---: | :---: |
-| [On-Policy First-Visit MC](../codes/MonteCarlo) | | [Racetrack](../codes/envs/racetrack_env.md) | |
-| [Q-Learning](../codes/QLearning) | | [CliffWalking-v0](../codes/envs/gym_info.md) | |
-| [Sarsa](../codes/Sarsa) | | [Racetrack](../codes/envs/racetrack_env.md) | |
-| [DQN](../codes/DQN) | [DQN paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | |
-| DQN-cnn | [DQN paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](../codes/envs/gym_info.md) | Uses a CNN instead of a fully connected network, unlike DQN |
-| [DoubleDQN](../codes/DoubleDQN) | | [CartPole-v0](../codes/envs/gym_info.md) | Results are poor; needs improvement |
-| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
-| [PolicyGradient](../codes/PolicyGradient) | | [CartPole-v0](../codes/envs/gym_info.md) | |
-| A2C | | [CartPole-v0](../codes/envs/gym_info.md) | |
-| A3C | | | |
-| SAC | | | |
-| [PPO](../codes/PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](../codes/envs/gym_info.md) | |
-| DDPG | [DDPG paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](../codes/envs/gym_info.md) | |
-| TD3 | [Twin Delayed DDPG paper](https://arxiv.org/abs/1802.09477) | | |
-| GAIL | | | |
+[Click here](../codes) or go to the ```codes``` folder for the hands-on algorithm code
 
 ## Contributors
 