Commit b6f63a91bf by JohnJim0816, 2021-03-31 15:37:09 +08:00 (parent 6a92f97138)
65 changed files with 1244 additions and 459 deletions

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-09 20:25:52 @Date: 2020-06-09 20:25:52
@LastEditor: John @LastEditor: John
LastEditTime: 2021-03-17 20:43:25 LastEditTime: 2021-03-31 00:56:32
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
@@ -58,9 +58,7 @@ class DDPG:
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device) done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
# 注意critic将(s_t,a)作为输入 # 注意critic将(s_t,a)作为输入
policy_loss = self.critic(state, self.actor(state)) policy_loss = self.critic(state, self.actor(state))
policy_loss = -policy_loss.mean() policy_loss = -policy_loss.mean()
next_action = self.target_actor(next_state) next_action = self.target_actor(next_state)
target_value = self.target_critic(next_state, next_action.detach()) target_value = self.target_critic(next_state, next_action.detach())
expected_value = reward + (1.0 - done) * self.gamma * target_value expected_value = reward + (1.0 - done) * self.gamma * target_value
@@ -87,7 +85,7 @@ class DDPG:
param.data * self.soft_tau param.data * self.soft_tau
) )
def save(self,path): def save(self,path):
torch.save(self.target_net.state_dict(), path+'DDPG_checkpoint.pth') torch.save(self.actor.state_dict(), path+'checkpoint.pt')
def load(self,path): def load(self,path):
self.actor.load_state_dict(torch.load(path+'DDPG_checkpoint.pth')) self.actor.load_state_dict(torch.load(path+'checkpoint.pt'))
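
For context on the `self.soft_tau` lines in the hunk above, here is a minimal standalone sketch of the soft (Polyak) target update that DDPG performs after each gradient step; the function and variable names are illustrative, not the repository's exact API:

```python
import copy
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, soft_tau: float = 1e-2):
    """Polyak averaging: theta_target <- soft_tau * theta_source + (1 - soft_tau) * theta_target."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

# usage sketch: keep a slowly moving copy of the actor
actor = nn.Linear(3, 1)
target_actor = copy.deepcopy(actor)
soft_update(target_actor, actor, soft_tau=1e-2)
```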

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21 @Date: 2020-06-11 20:58:21
@LastEditor: John @LastEditor: John
LastEditTime: 2021-03-19 19:57:00 LastEditTime: 2021-03-31 01:04:48
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
import sys,os import sys,os
sys.path.append(os.getcwd()) # 添加当前终端路径 from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import torch import torch
import gym import gym
import numpy as np import numpy as np
@@ -20,27 +25,23 @@ from DDPG.env import NormalizedActions,OUNoise
from common.plot import plot_rewards from common.plot import plot_rewards
from common.utils import save_results from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹 if not os.path.exists(curr_path+"/saved_model/"): os.mkdir(curr_path+"/saved_model/")
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") if not os.path.exists(SAVED_MODEL_PATH): os.mkdir(SAVED_MODEL_PATH)
if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
os.mkdir(SAVED_MODEL_PATH) if not os.path.exists(curr_path+"/results/"): os.mkdir(curr_path+"/results/")
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 if not os.path.exists(RESULT_PATH): os.mkdir(RESULT_PATH)
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹
os.mkdir(RESULT_PATH)
class DDPGConfig: class DDPGConfig:
def __init__(self): def __init__(self):
self.algo = 'DDPG'
self.gamma = 0.99 self.gamma = 0.99
self.critic_lr = 1e-3 self.critic_lr = 1e-3
self.actor_lr = 1e-4 self.actor_lr = 1e-4
self.memory_capacity = 10000 self.memory_capacity = 10000
self.batch_size = 128 self.batch_size = 128
self.train_eps =300 self.train_eps =300
self.train_steps = 200
self.eval_eps = 200 self.eval_eps = 200
self.eval_steps = 200 self.eval_steps = 200
self.target_update = 4 self.target_update = 4
@@ -56,19 +57,19 @@ def train(cfg,env,agent):
for i_episode in range(cfg.train_eps): for i_episode in range(cfg.train_eps):
state = env.reset() state = env.reset()
ou_noise.reset() ou_noise.reset()
done = False
ep_reward = 0 ep_reward = 0
for i_step in range(cfg.train_steps): i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state) action = agent.choose_action(state)
action = ou_noise.get_action( action = ou_noise.get_action(action, i_step) # 即paper中的random process
action, i_step) # 即paper中的random process
next_state, reward, done, _ = env.step(action) next_state, reward, done, _ = env.step(action)
ep_reward += reward ep_reward += reward
agent.memory.push(state, action, reward, next_state, done) agent.memory.push(state, action, reward, next_state, done)
agent.update() agent.update()
state = next_state state = next_state
if done: print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
break
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
ep_steps.append(i_step) ep_steps.append(i_step)
rewards.append(ep_reward) rewards.append(ep_reward)
if ma_rewards: if ma_rewards:
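
The `ou_noise.get_action(...)` call in the hunk above is the exploration noise ("random process" in the DDPG paper), i.e. an Ornstein-Uhlenbeck process imported from `DDPG/env.py`. Below is a minimal sketch of such a process, with hypothetical class and parameter names rather than the repository's implementation:

```python
import numpy as np

class SimpleOUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise (hypothetical names)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3):
        self.action_dim, self.mu, self.theta, self.sigma = action_dim, mu, theta, sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        # mean-reverting drift toward mu plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

noise = SimpleOUNoise(action_dim=1)
noise.reset()
noisy_action = np.clip(0.5 + noise.sample(), -1.0, 1.0)  # perturb a deterministic action
```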

Binary files changed (including one added image, 69 KiB); previews not shown.

View File

@@ -1,7 +1,7 @@
# DQN # DQN
## How It Works ## How It Works
DQN is an optimization and extension of the Q-learning algorithm: Q-learning stores value information in a finite Q-table, whereas DQN replaces the Q-table with a neural network, which is better suited to high-dimensional problems; for background see [Datawhale's notes on Hung-yi Lee's course - Q-learning](https://datawhalechina.github.io/leedeeprl-notes/#/chapter6/chapter6). DQN is an optimization and extension of the Q-learning algorithm: Q-learning stores value information in a finite Q-table, whereas DQN replaces the Q-table with a neural network, which is better suited to high-dimensional problems; for background see [Datawhale's notes on Hung-yi Lee's course - Q-learning](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).
For papers, two main references: the 2013 Google DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the team's later Nature paper [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf); the latter adds a target Q-network at the algorithm level and is also called Nature DQN. For papers, two main references: the 2013 Google DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the team's later Nature paper [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf); the latter adds a target Q-network at the algorithm level and is also called Nature DQN.
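
To make the idea concrete, here is a tiny sketch (illustrative only, not this repository's code) of the Q-table lookup becoming a network forward pass, with the Nature-DQN target computed from a separately synced copy:

```python
import torch
import torch.nn as nn

q_net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
target_net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
target_net.load_state_dict(q_net.state_dict())  # periodically re-synced copy (Nature DQN)

state = torch.randn(1, 4)          # toy CartPole-like state
q_values = q_net(state)            # replaces the Q-table lookup Q[s, :]
action = q_values.argmax(dim=1)    # greedy action

next_state, reward, done, gamma = torch.randn(1, 4), 1.0, 0.0, 0.99
with torch.no_grad():
    td_target = reward + gamma * (1.0 - done) * target_net(next_state).max(dim=1)[0]
```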

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49 @Date: 2020-06-12 00:50:49
@LastEditor: John @LastEditor: John
LastEditTime: 2021-03-13 14:56:23 LastEditTime: 2021-03-30 17:01:26
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
@@ -13,6 +13,8 @@ LastEditTime: 2021-03-13 14:56:23
''' '''
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
@@ -23,61 +25,44 @@ from common.memory import ReplayBuffer
from common.model import MLP from common.model import MLP
class DQN: class DQN:
def __init__(self, state_dim, action_dim, cfg): def __init__(self, state_dim, action_dim, cfg):
self.action_dim = action_dim # 总的动作个数 self.action_dim = action_dim # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等 self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子 self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数 # e-greedy策略相关参数
self.sample_count = 0 # 用于epsilon的衰减计数 self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = 0 self.epsilon = lambda frame_idx: cfg.epsilon_end + \
self.epsilon_start = cfg.epsilon_start (cfg.epsilon_start - cfg.epsilon_end) * \
self.epsilon_end = cfg.epsilon_end math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) self.policy_net = MLP(state_dim, action_dim,
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) hidden_dim=cfg.hidden_dim).to(self.device)
# target_net的初始模型参数完全复制policy_net self.target_net = MLP(state_dim, action_dim,
self.target_net.load_state_dict(self.policy_net.state_dict()) hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net.eval() # 不启用 BatchNormalization 和 Dropout
# 可查parameters()与state_dict()的区别前者require_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0 self.loss = 0
self.memory = ReplayBuffer(cfg.memory_capacity) self.memory = ReplayBuffer(cfg.memory_capacity)
def choose_action(self, state, train=True): def choose_action(self, state):
'''选择动作 '''选择动作
''' '''
if train: self.frame_idx += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ if random.random() > self.epsilon(self.frame_idx):
math.exp(-1. * self.sample_count / self.epsilon_decay) with torch.no_grad():
self.sample_count += 1 # 先转为张量便于丢给神经网络,state元素数据原本为float64
if random.random() > self.epsilon: # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
with torch.no_grad(): state = torch.tensor(
# 先转为张量便于丢给神经网络,state元素数据原本为float64 [state], device=self.device, dtype=torch.float32)
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
state = torch.tensor( q_value = self.policy_net(state)
[state], device=self.device, dtype=torch.float32) # tensor.max(1)返回每行的最大值以及对应的下标,
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>) # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
q_value = self.policy_net(state) # 所以tensor.max(1)[1]返回最大值对应的下标即action
# tensor.max(1)返回每行的最大值以及对应的下标, action = q_value.max(1)[1].item()
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) else:
# 所以tensor.max(1)[1]返回最大值对应的下标即action action = random.randrange(self.action_dim)
action = q_value.max(1)[1].item() return action
else:
action = random.randrange(self.action_dim)
return action
else:
with torch.no_grad(): # 取消保存梯度
# 先转为张量便于丢给神经网络,state元素数据原本为float64
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
state = torch.tensor(
[state], device='cpu', dtype=torch.float32) # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.target_net(state)
# tensor.max(1)返回每行的最大值以及对应的下标,
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
return action
def update(self): def update(self):
if len(self.memory) < self.batch_size: if len(self.memory) < self.batch_size:
@@ -96,32 +81,31 @@ class DQN:
next_state_batch = torch.tensor( next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float) next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32( done_batch = torch.tensor(np.float32(
done_batch), device=self.device).unsqueeze(1) # 将bool转为float然后转为张量 done_batch), device=self.device)
'''计算当前(s_t,a)对应的Q(s_t, a)''' '''计算当前(s_t,a)对应的Q(s_t, a)'''
'''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])''' '''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])'''
q_values = self.policy_net(state_batch).gather( q_values = self.policy_net(state_batch).gather(
dim=1, index=action_batch) # 等价于self.forward dim=1, index=action_batch) # 等价于self.forward
# 计算所有next states的V(s_{t+1})即通过target_net中选取reward最大的对应states # 计算所有next states的V(s_{t+1})即通过target_net中选取reward最大的对应states
next_state_values = self.target_net( next_q_values = self.target_net(next_state_batch).max(
next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,]) 1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,])
# 计算 expected_q_value # 计算 expected_q_value
# 对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward # 对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * \ expected_q_values = reward_batch + \
next_state_values * (1-done_batch[0]) self.gamma * next_q_values * (1-done_batch)
# self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss # self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss
self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算 均方误差loss self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算 均方误差loss
# 优化模型 # 优化模型
self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step
# loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分 # loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
self.loss.backward() self.loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸 # for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1) # param.grad.data.clamp_(-1, 1)
self.optimizer.step() # 更新模型 self.optimizer.step() # 更新模型
def save(self,path): def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth') torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self,path): def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth')) self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
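
One note on the refactor above: the explicit epsilon bookkeeping is replaced by a decay schedule expressed as a lambda over `frame_idx`. A small standalone sketch of how that schedule behaves, using the `epsilon_start=1`, `epsilon_end=0.01`, `epsilon_decay=500` values this commit sets in `DQNConfig`:

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 500
epsilon = lambda frame_idx: epsilon_end + \
    (epsilon_start - epsilon_end) * math.exp(-1.0 * frame_idx / epsilon_decay)

for frame_idx in (0, 100, 500, 2000):
    print(frame_idx, round(epsilon(frame_idx), 3))
# roughly 1.0 at frame 0, 0.82 after 100 frames, 0.37 after 500, 0.03 after 2000
```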

codes/DQN/main.ipynb: new file, 467 lines (diff suppressed because one or more lines are too long)

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57 @Date: 2020-06-12 00:48:57
@LastEditor: John @LastEditor: John
LastEditTime: 2021-03-26 17:17:17 LastEditTime: 2021-03-30 16:59:19
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
import sys,os import sys,os
sys.path.append(os.getcwd()) # 添加当前终端路径 from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import gym import gym
import torch import torch
import datetime import datetime
@@ -18,58 +23,52 @@ from DQN.agent import DQN
from common.plot import plot_rewards from common.plot import plot_rewards
from common.utils import save_results from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹 if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH) os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹 if not os.path.exists(curr_path+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH) os.mkdir(RESULT_PATH)
class DQNConfig: class DQNConfig:
def __init__(self): def __init__(self):
self.algo = "DQN" # 算法名称 self.algo = "DQN" # name of algo
self.gamma = 0.99 self.gamma = 0.95
self.epsilon_start = 0.95 # e-greedy策略的初始epsilon self.epsilon_start = 1 # e-greedy策略的初始epsilon
self.epsilon_end = 0.01 self.epsilon_end = 0.01
self.epsilon_decay = 200 self.epsilon_decay = 500
self.lr = 0.01 # 学习率 self.lr = 0.0001 # learning rate
self.memory_capacity = 800 # Replay Memory容量 self.memory_capacity = 10000 # Replay Memory容量
self.batch_size = 64 self.batch_size = 32
self.train_eps = 300 # 训练的episode数目 self.train_eps = 300 # 训练的episode数目
self.train_steps = 200 # 训练每个episode的最大长度
self.target_update = 2 # target net的更新频率 self.target_update = 2 # target net的更新频率
self.eval_eps = 20 # 测试的episode数目 self.eval_eps = 20 # 测试的episode数目
self.eval_steps = 200 # 测试每个episode的最大长度
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.hidden_dim = 128 # 神经网络隐藏层维度 self.hidden_dim = 256 # 神经网络隐藏层维度
def train(cfg,env,agent): def train(cfg,env,agent):
print('Start to train !') print('Start to train !')
rewards = [] rewards = []
ma_rewards = [] # 滑动平均的reward ma_rewards = [] # moving average reward
ep_steps = []
for i_episode in range(cfg.train_eps): for i_episode in range(cfg.train_eps):
state = env.reset() # reset环境状态 state = env.reset()
done = False
ep_reward = 0 ep_reward = 0
for i_step in range(cfg.train_steps): while not done:
action = agent.choose_action(state) # 根据当前环境state选择action action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action) # 更新环境参数 next_state, reward, done, _ = env.step(action)
ep_reward += reward ep_reward += reward
agent.memory.push(state, action, reward, next_state, done) # 将state等这些transition存入memory agent.memory.push(state, action, reward, next_state, done)
state = next_state # 跳转到下一个状态 state = next_state
agent.update() # 每步更新网络 agent.update()
if done:
break
# 更新target network复制DQN中的所有weights and biases
if i_episode % cfg.target_update == 0: if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict()) agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done)) print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
ep_steps.append(i_step)
rewards.append(ep_reward) rewards.append(ep_reward)
# 计算滑动窗口的reward # 计算滑动窗口的reward
if ma_rewards: if ma_rewards:
@@ -82,8 +81,8 @@ def train(cfg,env,agent):
if __name__ == "__main__": if __name__ == "__main__":
cfg = DQNConfig() cfg = DQNConfig()
env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym此处一般不需要 env = gym.make('CartPole-v0')
env.seed(1) # 设置env随机种子 env.seed(1)
state_dim = env.observation_space.shape[0] state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n action_dim = env.action_space.n
agent = DQN(state_dim,action_dim,cfg) agent = DQN(state_dim,action_dim,cfg)

Several binary files changed (removed images: 58 KiB, 27 KiB, 66 KiB; added images: 50 KiB, 51 KiB); previews not shown.

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09 @Date: 2020-06-11 10:01:09
@LastEditor: John @LastEditor: John
LastEditTime: 2021-03-23 20:43:28 LastEditTime: 2021-03-29 20:23:48
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
import sys,os import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import gym import gym
import torch import torch
import datetime import datetime
@@ -19,17 +24,15 @@ from DQN_cnn.agent import DQNcnn
from common.plot import plot_rewards from common.plot import plot_rewards
from common.utils import save_results from common.utils import save_results
sys.path.append(os.getcwd()) # add current terminal path to sys.path
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH) os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): if not os.path.exists(curr_path+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH): if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH) os.mkdir(RESULT_PATH)

View File

@@ -1,40 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
LastEditTime: 2021-01-20 18:58:37
@Discription:
@Environment: python 3.7.7
'''
import random
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # buffer的最大容量
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
'''以队列的方式将样本填入buffer中
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
'''随机采样batch_size个样本
'''
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = zip(*batch)
return state, action, reward, next_state, done
def __len__(self):
'''返回buffer的长度
'''
return len(self.buffer)

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:47:02
@LastEditor: John
LastEditTime: 2020-08-19 16:55:54
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
def __init__(self, n_states=4, n_actions=18):
""" 初始化q网络为全连接网络
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, 128) # 输入层
self.fc2 = nn.Linear(128, 128) # 隐藏层
self.fc3 = nn.Linear(128, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

View File

@@ -1,51 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-12-22 15:22:17
LastEditor: John
LastEditTime: 2021-01-21 14:30:38
Discription:
Environment:
'''
import datetime
import os
import argparse
ALGO_NAME = 'Double DQN'
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'
TRAIN_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
EVAL_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
def get_args():
'''模型参数
'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 表示训练0表示只进行eval
parser.add_argument("--gamma", default=0.99,
type=float) # q-learning中的gamma
parser.add_argument("--epsilon_start", default=0.95,
type=float) # 基于贪心选择action对应的参数epsilon
parser.add_argument("--epsilon_end", default=0.01, type=float)
parser.add_argument("--epsilon_decay", default=500, type=float)
parser.add_argument("--policy_lr", default=0.01, type=float)
parser.add_argument("--memory_capacity", default=1000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling")
parser.add_argument("--train_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--train_steps", default=200, type=int)
parser.add_argument("--target_update", default=2, type=int,
help="when(every default 2 eisodes) to update target net ") # 更新频率
parser.add_argument("--eval_eps", default=100, type=int) # 训练的最大episode数目
parser.add_argument("--eval_steps", default=200,
type=int) # 训练每个episode的长度
config = parser.parse_args()
return config

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-12-22 15:24:31
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from params import ALGO_NAME
def plot(item,ylabel='rewards_train', save_fig = True):
'''plot using searborn to plot
'''
sns.set()
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of '+ALGO_NAME)
plt.ylabel(ylabel)
plt.xlabel('episodes')
if save_fig:
plt.savefig(os.path.dirname(__file__)+"/results/"+ylabel+".png")
plt.show()
# plt.show()
if __name__ == "__main__":
output_path = os.path.split(os.path.abspath(__file__))[0]+"/results/"
tag = 'train'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)
tag = 'eval'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards,ylabel='rewards_'+tag)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)

Binary file removed (image, 74 KiB); preview not shown.

View File

@@ -0,0 +1,13 @@
# Hierarchical DQN
## How It Works
Hierarchical DQN is a hierarchical reinforcement learning method; compared with DQN it adds a meta controller:
![image-20210331153115575](assets/image-20210331153115575.png)
That is, during learning the meta controller produces a goal each time, and the controller (the lower-level actor) then works toward that goal until done. It is as if the agent gained a captain who is good at setting local objectives and guiding the agent forward, which helps on problems with long episodes or sparse rewards.
## Pseudocode
![image-20210331153542314](assets/image-20210331153542314.png)
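
A rough, runnable sketch of that two-level loop is shown below; the helper names are stubs for illustration, while the actual implementation lives in `HierarchicalDQN/agent.py` and `HierarchicalDQN/main.py`:

```python
import numpy as np

# Stubs standing in for the real environment and networks (illustrative only).
def env_reset():              return np.array([0.0, 1.0, 0.0, 0.0])
def env_step(action):         return np.zeros(4), 0.0, True      # next_state, reward, done
def meta_controller(state):   return 0                           # picks a goal index
def controller(state, goal):  return 0                           # picks a primitive action

state, done = env_reset(), False
while not done:
    goal = meta_controller(state)                 # the "captain" sets a sub-goal
    meta_state, extrinsic_reward = state, 0.0
    while not done and goal != np.argmax(state):  # controller pursues the goal
        action = controller(state, goal)
        next_state, reward, done = env_step(action)
        extrinsic_reward += reward
        # intrinsic reward for the controller: 1 when the goal is reached, else 0
        intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
        state = next_state
    # the meta controller later learns from (meta_state, goal, extrinsic_reward, state)
```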

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18 Date: 2021-03-24 22:18:18
LastEditor: John LastEditor: John
LastEditTime: 2021-03-27 04:24:30 LastEditTime: 2021-03-31 14:51:09
Discription: Discription:
Environment: Environment:
''' '''
@@ -13,90 +13,103 @@ import torch
import torch.nn as nn import torch.nn as nn
import numpy as np import numpy as np
import random,math import random,math
from HierarchicalDQN.model import MLP
from common.memory import ReplayBuffer
import torch.optim as optim import torch.optim as optim
from common.model import MLP
from common.memory import ReplayBuffer
class HierarchicalDQN: class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg): def __init__(self,state_dim,action_dim,cfg):
self.state_dim = state_dim
self.action_dim = action_dim self.action_dim = action_dim
self.gamma = cfg.gamma
self.device = cfg.device self.device = cfg.device
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.sample_count = 0 self.frame_idx = 0
self.epsilon = 0 self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity) self.memory = ReplayBuffer(cfg.memory_capacity)
self.meta_memory = ReplayBuffer(cfg.memory_capacity) self.meta_memory = ReplayBuffer(cfg.memory_capacity)
def to_onehot(x): self.loss_numpy = 0
oh = np.zeros(6) self.meta_loss_numpy = 0
self.losses = []
self.meta_losses = []
def to_onehot(self,x):
oh = np.zeros(self.state_dim)
oh[x - 1] = 1. oh[x - 1] = 1.
return oh return oh
def set_goal(self,meta_state): def set_goal(self,state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay) if random.random() > self.epsilon(self.frame_idx):
self.sample_count += 1
if random.random() > self.epsilon:
with torch.no_grad(): with torch.no_grad():
meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32) state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(meta_state) goal = self.meta_policy_net(state).max(1)[1].item()
goal = q_value.max(1)[1].item()
else: else:
goal = random.randrange(self.action_dim) goal = random.randrange(self.state_dim)
goal = self.meta_policy_net(meta_state) return goal
onehot_goal = self.to_onehot(goal)
return onehot_goal
def choose_action(self,state): def choose_action(self,state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay) self.frame_idx += 1
self.sample_count += 1 if random.random() > self.epsilon(self.frame_idx):
if random.random() > self.epsilon:
with torch.no_grad(): with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32) state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state) q_value = self.policy_net(state)
action = q_value.max(1)[1].item() action = q_value.max(1)[1].item()
else: else:
action = random.randrange(self.action_dim) action = random.randrange(self.action_dim)
return action return action
def update(self): def update(self):
self.update_policy()
self.update_meta()
def update_policy(self):
if self.batch_size > len(self.memory): if self.batch_size > len(self.memory):
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size) return
state_batch = torch.tensor( state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch, device=self.device, dtype=torch.float) state_batch = torch.tensor(state_batch,dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) reward_batch = torch.tensor(reward_batch,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) done_batch = torch.tensor(np.float32(done_batch))
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.target_net(next_state_batch).max(1)[0].detach() next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0]) expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) loss = nn.MSELoss()(q_values, expected_q_values)
self.optimizer.zero_grad() self.optimizer.zero_grad()
loss.backward() loss.backward()
for param in self.policy_net.parameters(): for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1) param.grad.data.clamp_(-1, 1)
self.optimizer.step() self.optimizer.step()
self.loss_numpy = loss.detach().numpy()
self.losses.append(self.loss_numpy)
def update_meta(self):
if self.batch_size > len(self.meta_memory): if self.batch_size > len(self.meta_memory):
meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size) return
meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float) state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1) state_batch = torch.tensor(state_batch,dtype=torch.float)
meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float) action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float) reward_batch = torch.tensor(reward_batch,dtype=torch.float)
meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1) next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch) done_batch = torch.tensor(np.float32(done_batch))
next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach() q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0]) next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
meta_loss = nn.MSEmeta_loss()(meta_q_values, expected_meta_q_values.unsqueeze(1)) expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
meta_loss = nn.MSELoss()(q_values, expected_q_values)
self.meta_optimizer.zero_grad() self.meta_optimizer.zero_grad()
meta_loss.backward() meta_loss.backward()
for param in self.meta_policy_net.parameters(): for param in self.meta_policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1) param.grad.data.clamp_(-1, 1)
self.meta_optimizer.step() self.meta_optimizer.step()
self.meta_loss_numpy = meta_loss.detach().numpy()
self.meta_losses.append(self.meta_loss_numpy)
def save(self, path):
torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
def load(self, path):
self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))

Two binary images added (112 KiB and 311 KiB); previews not shown.

File diff suppressed because one or more lines are too long

View File

@@ -3,95 +3,108 @@
''' '''
Author: John Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:04 Date: 2021-03-29 10:37:32
LastEditor: John LastEditor: John
LastEditTime: 2021-03-27 04:23:43 LastEditTime: 2021-03-31 14:58:49
Discription: Discription:
Environment: Environment:
''' '''
import sys,os import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path curr_path = os.path.dirname(__file__)
import gym parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import datetime
import numpy as np import numpy as np
import torch import torch
import datetime import gym
from HierarchicalDQN.agent import HierarchicalDQN
from common.plot import plot_rewards
from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time from common.utils import save_results
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model from common.plot import plot_rewards,plot_losses
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): from HierarchicalDQN.agent import HierarchicalDQN
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
SEQUENCE = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH) os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): if not os.path.exists(curr_path+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH): if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH) os.mkdir(RESULT_PATH)
class HierarchicalDQNConfig: class HierarchicalDQNConfig:
def __init__(self): def __init__(self):
self.algo = "DQN" # name of algo self.algo = "H-DQN" # name of algo
self.gamma = 0.99 self.gamma = 0.99
self.epsilon_start = 0.95 # start epsilon of e-greedy policy self.epsilon_start = 1 # start epsilon of e-greedy policy
self.epsilon_end = 0.01 self.epsilon_end = 0.01
self.epsilon_decay = 200 self.epsilon_decay = 200
self.lr = 0.01 # learning rate self.lr = 0.0001 # learning rate
self.memory_capacity = 800 # Replay Memory capacity self.memory_capacity = 10000 # Replay Memory capacity
self.batch_size = 64 self.batch_size = 32
self.train_eps = 250 # 训练的episode数目 self.train_eps = 300 # 训练的episode数目
self.train_steps = 200 # 训练每个episode的最大长度 self.target_update = 2 # target net的更新频率
self.target_update = 2 # target net的更新频率 self.eval_eps = 20 # 测试的episode数目
self.eval_eps = 20 # 测试的episode数目 self.device = torch.device(
self.eval_steps = 200 # 测试每个episode的最大长度 "cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu self.hidden_dim = 256 # dimension of hidden layer
self.hidden_dim = 256 # dimension of hidden layer
def train(cfg,env,agent):
def train(cfg, env, agent):
print('Start to train !') print('Start to train !')
rewards = [] rewards = []
ma_rewards = [] # moving average reward ma_rewards = [] # moveing average reward
ep_steps = []
for i_episode in range(cfg.train_eps): for i_episode in range(cfg.train_eps):
state = env.reset() state = env.reset()
extrinsic_reward = 0 done = False
for i_step in range(cfg.train_steps): ep_reward = 0
goal= agent.set_goal(state) while not done:
goal = agent.set_goal(state)
onehot_goal = agent.to_onehot(goal)
meta_state = state meta_state = state
goal_state = np.concatenate([state, goal]) extrinsic_reward = 0
action = agent.choose_action(state) while not done and goal != np.argmax(state):
next_state, reward, done, _ = env.step(action) goal_state = np.concatenate([state, onehot_goal])
extrinsic_reward += reward action = agent.choose_action(goal_state)
intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0 next_state, reward, done, _ = env.step(action)
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done) ep_reward += reward
state = next_state extrinsic_reward += reward
agent.update() intrinsic_reward = 1.0 if goal == np.argmax(
if done: next_state) else 0.0
break agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
if i_episode % cfg.target_update == 0: [next_state, onehot_goal]), done)
agent.target_net.load_state_dict(agent.policy_net.state_dict()) state = next_state
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done)) agent.update()
ep_steps.append(i_step) agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
rewards.append(extrinsic_reward) print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
rewards.append(ep_reward)
if ma_rewards: if ma_rewards:
ma_rewards.append( ma_rewards.append(
0.9*ma_rewards[-1]+0.1*extrinsic_reward) 0.9*ma_rewards[-1]+0.1*ep_reward)
else: else:
ma_rewards.append(extrinsic_reward) ma_rewards.append(ep_reward)
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
print('Complete training') print('Complete training')
return rewards,ma_rewards return rewards, ma_rewards
if __name__ == "__main__": if __name__ == "__main__":
cfg = HierarchicalDQNConfig()
env = gym.make('CartPole-v0') env = gym.make('CartPole-v0')
env.seed(1) env.seed(1)
cfg = HierarchicalDQNConfig()
state_dim = env.observation_space.shape[0] state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n action_dim = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg) agent = HierarchicalDQN(state_dim, action_dim, cfg)
rewards,ma_rewards = train(cfg,env,agent) rewards, ma_rewards = train(cfg, env, agent)
agent.save(path=SAVED_MODEL_PATH) agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=RESULT_PATH)
plot_losses(agent.losses,algo=cfg.algo, path=RESULT_PATH)

View File

@@ -1,24 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:17:09
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, action_dim)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

Binary images changed (added: 73 KiB, 21 KiB, 62 KiB; removed: 39 KiB); previews not shown.

View File

@@ -19,9 +19,10 @@
## 运行环境 ## 运行环境
python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0 python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0
## 使用说明 ## 使用说明
Run ```main.py``` in the corresponding algorithm folder Run ```main.py``` or ```main.ipynb```
## 算法进度 ## 算法进度
| 算法名称 | 相关论文材料 | 环境 | 备注 | | 算法名称 | 相关论文材料 | 环境 | 备注 |
@@ -29,17 +30,17 @@ python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0
| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | | | [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | | | [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | | | [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | | [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | | [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 |
| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | 效果不好,待改进 | | [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | 效果不好,待改进 |
| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | | | Hierarchical DQN | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | | |
| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | | | [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | |
| A2C | | [CartPole-v0](./envs/gym_info.md) | | | A2C | | [CartPole-v0](./envs/gym_info.md) | |
| A3C | | | | | A3C | | | |
| SAC | | | | | SAC | | | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | | | [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | | | DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | | | TD3 | [TD3 Paper](https://arxiv.org/abs/1802.09477) | | |
| GAIL | | | | | GAIL | | | |

View File

@@ -24,7 +24,7 @@ Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in diff
python 3.7.9、pytorch 1.6.0、gym 0.18.0 python 3.7.9、pytorch 1.6.0、gym 0.18.0
## Usage ## Usage
Environment information: see [环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md) run ```main.py``` or ```main.ipynb```
## Schedule ## Schedule

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-12 21:14:12 Date: 2021-03-12 21:14:12
LastEditor: John LastEditor: John
LastEditTime: 2021-03-24 22:15:00 LastEditTime: 2021-03-31 13:49:06
Discription: Discription:
Environment: Environment:
''' '''
@@ -15,15 +15,15 @@ import torch.nn.functional as F
from torch.distributions import Categorical from torch.distributions import Categorical
class MLP(nn.Module): class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128): def __init__(self, input_dim,output_dim,hidden_dim=128):
""" 初始化q网络为全连接网络 """ 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目 input_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数 output_dim: 输出的action总个数
""" """
super(MLP, self).__init__() super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层
def forward(self, x): def forward(self, x):
# 各层对应的激活函数 # 各层对应的激活函数
@@ -32,10 +32,10 @@ class MLP(nn.Module):
return self.fc3(x) return self.fc3(x)
class Critic(nn.Module): class Critic(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
super(Critic, self).__init__() super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + action_dim, hidden_size) self.linear1 = nn.Linear(n_obs + output_dim, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1) self.linear3 = nn.Linear(hidden_size, 1)
# 随机初始化为较小的值 # 随机初始化为较小的值
@@ -51,11 +51,11 @@ class Critic(nn.Module):
return x return x
class Actor(nn.Module): class Actor(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
super(Actor, self).__init__() super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size) self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, action_dim) self.linear3 = nn.Linear(hidden_size, output_dim)
self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
return x return x
class ActorCritic(nn.Module): class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim=256): def __init__(self, input_dim, output_dim, hidden_dim=256):
super(ActorCritic, self).__init__() super(ActorCritic, self).__init__()
self.critic = nn.Sequential( self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(input_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, 1) nn.Linear(hidden_dim, 1)
) )
self.actor = nn.Sequential( self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(input_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, action_dim), nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1), nn.Softmax(dim=1),
) )
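
Since the `ActorCritic` actor head above ends in `nn.Softmax(dim=1)`, its output is a row of action probabilities. A short sketch (dummy tensors, not the repository's code) of how such an output is typically consumed via `torch.distributions.Categorical`:

```python
import torch
from torch.distributions import Categorical

probs = torch.softmax(torch.randn(1, 2), dim=1)  # stands in for self.actor(state)
dist = Categorical(probs)
action = dist.sample()                           # e.g. tensor([1])
log_prob = dist.log_prob(action)                 # used in policy-gradient style losses
```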

View File

@@ -5,13 +5,13 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2020-10-07 20:57:11 Date: 2020-10-07 20:57:11
LastEditor: John LastEditor: John
LastEditTime: 2021-03-13 11:31:49 LastEditTime: 2021-03-31 14:05:52
Discription: Discription:
Environment: Environment:
''' '''
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path='./'): def plot_rewards(rewards,ma_rewards,tag="train",algo = "DQN",path='./'):
sns.set() sns.set()
plt.title("average learning curve of {}".format(algo)) plt.title("average learning curve of {}".format(algo))
plt.xlabel('episodes') plt.xlabel('episodes')
@@ -20,4 +20,13 @@ def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC
plt.legend() plt.legend()
plt.savefig(path+"rewards_curve_{}".format(tag)) plt.savefig(path+"rewards_curve_{}".format(tag))
plt.show() plt.show()
def plot_losses(losses,algo = "DQN",path='./'):
sns.set()
plt.title("loss curve of {}".format(algo))
plt.xlabel('episodes')
plt.plot(losses,label='losses')
plt.legend()
plt.savefig(path+"losses_curve")
plt.show()