JohnJim0816
2021-03-31 15:37:09 +08:00
parent 6a92f97138
commit b6f63a91bf
65 changed files with 1244 additions and 459 deletions

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-09 20:25:52
@LastEditor: John
LastEditTime: 2021-03-17 20:43:25
LastEditTime: 2021-03-31 00:56:32
@Discription:
@Environment: python 3.7.7
'''
@@ -58,9 +58,7 @@ class DDPG:
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
# note that the critic takes (s_t, a) as input
policy_loss = self.critic(state, self.actor(state))
policy_loss = -policy_loss.mean()
next_action = self.target_actor(next_state)
target_value = self.target_critic(next_state, next_action.detach())
expected_value = reward + (1.0 - done) * self.gamma * target_value
@@ -87,7 +85,7 @@ class DDPG:
param.data * self.soft_tau
)
def save(self,path):
torch.save(self.target_net.state_dict(), path+'DDPG_checkpoint.pth')
torch.save(self.actor.state_dict(), path+'checkpoint.pt')
def load(self,path):
self.actor.load_state_dict(torch.load(path+'DDPG_checkpoint.pth'))
self.actor.load_state_dict(torch.load(path+'checkpoint.pt'))

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2021-03-19 19:57:00
LastEditTime: 2021-03-31 01:04:48
@Discription:
@Environment: python 3.7.7
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path
from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import torch
import gym
import numpy as np
@@ -20,27 +25,23 @@ from DDPG.env import NormalizedActions,OUNoise
from common.plot import plot_rewards
from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH): # check whether the folder exists
os.mkdir(RESULT_PATH)
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"): os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(curr_path+"/results/"): os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH): os.mkdir(RESULT_PATH)
class DDPGConfig:
def __init__(self):
self.algo = 'DDPG'
self.gamma = 0.99
self.critic_lr = 1e-3
self.actor_lr = 1e-4
self.memory_capacity = 10000
self.batch_size = 128
self.train_eps = 300
self.train_steps = 200
self.eval_eps = 200
self.eval_steps = 200
self.target_update = 4
@@ -56,19 +57,19 @@ def train(cfg,env,agent):
for i_episode in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
for i_step in range(cfg.train_steps):
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(
action, i_step) # i.e. the random process in the paper
action = ou_noise.get_action(action, i_step) # i.e. the random process in the paper
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if done:
break
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
ep_steps.append(i_step)
rewards.append(ep_reward)
if ma_rewards:

Binary files not shown

View File

@@ -1,7 +1,7 @@
# DQN
## Overview
DQN is an optimization and extension of the Q-learning algorithm: Q-learning stores value information in a finite Q-table, while DQN replaces the Q-table with a neural network, which is better suited to high-dimensional problems. For the background, see [datawhale李宏毅笔记-Q学习](https://datawhalechina.github.io/leedeeprl-notes/#/chapter6/chapter6).
DQN is an optimization and extension of the Q-learning algorithm: Q-learning stores value information in a finite Q-table, while DQN replaces the Q-table with a neural network, which is better suited to high-dimensional problems. For the background, see [datawhale李宏毅笔记-Q学习](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).
The main references are two papers: the 2013 Google DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the team's later Nature paper [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf). The latter adds a target Q-network at the algorithm level and is also called Nature DQN.
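To make the "network replaces the Q-table" idea concrete, here is a minimal sketch written for this note (it is not the code in `DQN/agent.py`; names such as `QNet`, `choose_action` and `dqn_loss` are illustrative): a small fully connected network maps a state to one Q-value per action, actions are chosen ε-greedily with an exponentially decayed ε, and the TD target uses a separate target network as in Nature DQN.

```python
import math, random
import torch
import torch.nn as nn

class QNet(nn.Module):
    """Small fully connected network that replaces the Q-table: state in, one Q-value per action out."""
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
        )
    def forward(self, x):
        return self.net(x)

def choose_action(policy_net, state, action_dim, frame_idx,
                  eps_start=1.0, eps_end=0.01, eps_decay=500):
    # exponentially decayed epsilon-greedy exploration
    epsilon = eps_end + (eps_start - eps_end) * math.exp(-frame_idx / eps_decay)
    if random.random() > epsilon:
        with torch.no_grad():
            q_values = policy_net(torch.tensor([state], dtype=torch.float32))
        return q_values.max(1)[1].item()   # greedy action
    return random.randrange(action_dim)    # random exploration

def dqn_loss(policy_net, target_net, batch, gamma=0.99):
    # batch = (state, action, reward, next_state, done), each already a tensor
    # (action is int64 of shape [B], done is float 0/1 of shape [B])
    state, action, reward, next_state, done = batch
    q = policy_net(state).gather(1, action.unsqueeze(1)).squeeze(1)   # Q(s_t, a_t)
    next_q = target_net(next_state).max(1)[0].detach()                # max_a Q_target(s_{t+1}, a)
    target = reward + gamma * next_q * (1 - done)                     # terminal states keep only the reward
    return nn.MSELoss()(q, target)
```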

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-03-13 14:56:23
LastEditTime: 2021-03-30 17:01:26
@Discription:
@Environment: python 3.7.7
'''
@@ -13,6 +13,8 @@ LastEditTime: 2021-03-13 14:56:23
'''
import torch
import torch.nn as nn
import torch.optim as optim
@@ -23,61 +25,44 @@ from common.memory import ReplayBuffer
from common.model import MLP
class DQN:
def __init__(self, state_dim, action_dim, cfg):
self.action_dim = action_dim # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # discount factor for rewards
self.gamma = cfg.gamma # discount factor for rewards
# parameters of the e-greedy policy
self.sample_count = 0 # counter used for epsilon decay
self.epsilon = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.frame_idx = 0 # counter used for epsilon decay
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net starts as an exact copy of policy_net's parameters
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # disable BatchNormalization and Dropout
# see the difference between parameters() and state_dict(): the former has requires_grad=True
self.policy_net = MLP(state_dim, action_dim,
hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,
hidden_dim=cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = ReplayBuffer(cfg.memory_capacity)
def choose_action(self, state, train=True):
def choose_action(self, state):
'''choose an action
'''
if train:
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay)
self.sample_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
# convert to a tensor before feeding the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1) returns each row's maximum value and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
return action
else:
with torch.no_grad(): # do not track gradients
# convert to a tensor before feeding the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device='cpu', dtype=torch.float32) # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.target_net(state)
# tensor.max(1) returns each row's maximum value and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum, i.e. the action
action = q_value.max(1)[1].item()
return action
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
# convert to a tensor before feeding the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1) returns each row's maximum value and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
return action
def update(self):
if len(self.memory) < self.batch_size:
@@ -96,32 +81,31 @@ class DQN:
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device).unsqueeze(1) # convert bool to float and then to a tensor
done_batch), device=self.device)
'''compute Q(s_t, a) for the current (s_t, a) pairs'''
'''torch.gather: for a=torch.Tensor([[1,2],[3,4]]), a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])'''
q_values = self.policy_net(state_batch).gather(
dim=1, index=action_batch) # equivalent to self.forward
# compute V(s_{t+1}) for all next states, i.e. the max Q-value over actions from target_net
next_state_values = self.target_net(
next_state_batch).max(1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
next_q_values = self.target_net(next_state_batch).max(
1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
# compute expected_q_value
# for terminal states done_batch=1, so the corresponding expected_q_value equals the reward
expected_q_values = reward_batch + self.gamma * \
next_state_values * (1-done_batch[0])
expected_q_values = reward_batch + \
self.gamma * next_q_values * (1-done_batch)
# self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # Huber loss
self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # mean squared error loss
# optimize the model
self.optimizer.zero_grad() # zero_grad clears all old gradients from the last step
# loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters that require gradients
self.loss.backward()
for param in self.policy_net.parameters(): # gradient clipping to prevent exploding gradients
param.grad.data.clamp_(-1, 1)
# for param in self.policy_net.parameters(): # gradient clipping to prevent exploding gradients
# param.grad.data.clamp_(-1, 1)
self.optimizer.step() # update the model
def save(self,path):
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self,path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))

codes/DQN/main.ipynb (new file, 467 lines added)

File diff suppressed because one or more lines are too long

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-03-26 17:17:17
LastEditTime: 2021-03-30 16:59:19
@Discription:
@Environment: python 3.7.7
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path
from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import gym
import torch
import datetime
@@ -18,58 +23,52 @@ from DQN.agent import DQN
from common.plot import plot_rewards
from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH): # check whether the folder exists
RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(curr_path+"/results/"):
os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)
class DQNConfig:
def __init__(self):
self.algo = "DQN" # 算法名称
self.gamma = 0.99
self.epsilon_start = 0.95 # e-greedy策略的初始epsilon
self.algo = "DQN" # name of algo
self.gamma = 0.95
self.epsilon_start = 1 # e-greedy策略的初始epsilon
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.01 # 学习率
self.memory_capacity = 800 # Replay Memory容量
self.batch_size = 64
self.epsilon_decay = 500
self.lr = 0.0001 # learning rate
self.memory_capacity = 10000 # Replay Memory容量
self.batch_size = 32
self.train_eps = 300 # 训练的episode数目
self.train_steps = 200 # 训练每个episode的最大长度
self.target_update = 2 # target net的更新频率
self.eval_eps = 20 # 测试的episode数目
self.eval_steps = 200 # 测试每个episode的最大长度
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.hidden_dim = 128 # 神经网络隐藏层维度
self.hidden_dim = 256 # 神经网络隐藏层维度
def train(cfg,env,agent):
print('Start to train !')
rewards = []
ma_rewards = [] # moving-average reward
ep_steps = []
ma_rewards = [] # moving average reward
for i_episode in range(cfg.train_eps):
state = env.reset() # reset the environment state
state = env.reset()
done = False
ep_reward = 0
for i_step in range(cfg.train_steps):
action = agent.choose_action(state) # choose an action based on the current environment state
next_state, reward, done, _ = env.step(action) # step the environment
while not done:
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done) # store the transition in memory
state = next_state # move to the next state
agent.update() # update the network at every step
if done:
break
# update the target network, copying all weights and biases from the policy net
agent.memory.push(state, action, reward, next_state, done)
state = next_state
agent.update()
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
ep_steps.append(i_step)
print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
rewards.append(ep_reward)
# compute the moving-average reward over a sliding window
if ma_rewards:
@@ -82,8 +81,8 @@ def train(cfg,env,agent):
if __name__ == "__main__":
cfg = DQNConfig()
env = gym.make('CartPole-v0').unwrapped # you can look up why gym envs are unwrapped; it is usually not needed here
env.seed(1) # set the random seed of the env
env = gym.make('CartPole-v0')
env.seed(1)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DQN(state_dim,action_dim,cfg)

Binary files not shown

View File

@@ -5,12 +5,17 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
LastEditTime: 2021-03-23 20:43:28
LastEditTime: 2021-03-29 20:23:48
@Discription:
@Environment: python 3.7.7
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path
from pathlib import Path
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import gym
import torch
import datetime
@@ -19,17 +24,15 @@ from DQN_cnn.agent import DQNcnn
from common.plot import plot_rewards
from common.utils import save_results
sys.path.append(os.getcwd()) # add current terminal path to sys.path
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(curr_path+"/results/"):
os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)

View File

@@ -1,40 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
LastEditTime: 2021-01-20 18:58:37
@Discription:
@Environment: python 3.7.7
'''
import random
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # maximum capacity of the buffer
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
'''push a sample into the buffer, queue-style
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
'''randomly sample batch_size samples
'''
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = zip(*batch)
return state, action, reward, next_state, done
def __len__(self):
'''return the current length of the buffer
'''
return len(self.buffer)

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:47:02
@LastEditor: John
LastEditTime: 2020-08-19 16:55:54
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
def __init__(self, n_states=4, n_actions=18):
""" 初始化q网络为全连接网络
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, 128) # input layer
self.fc2 = nn.Linear(128, 128) # hidden layer
self.fc3 = nn.Linear(128, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

View File

@@ -1,51 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-12-22 15:22:17
LastEditor: John
LastEditTime: 2021-01-21 14:30:38
Discription:
Environment:
'''
import datetime
import os
import argparse
ALGO_NAME = 'Double DQN'
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'
TRAIN_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
EVAL_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
def get_args():
'''model parameters
'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 表示训练0表示只进行eval
parser.add_argument("--gamma", default=0.99,
type=float) # q-learning中的gamma
parser.add_argument("--epsilon_start", default=0.95,
type=float) # 基于贪心选择action对应的参数epsilon
parser.add_argument("--epsilon_end", default=0.01, type=float)
parser.add_argument("--epsilon_decay", default=500, type=float)
parser.add_argument("--policy_lr", default=0.01, type=float)
parser.add_argument("--memory_capacity", default=1000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling")
parser.add_argument("--train_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--train_steps", default=200, type=int)
parser.add_argument("--target_update", default=2, type=int,
help="when(every default 2 eisodes) to update target net ") # 更新频率
parser.add_argument("--eval_eps", default=100, type=int) # 训练的最大episode数目
parser.add_argument("--eval_steps", default=200,
type=int) # 训练每个episode的长度
config = parser.parse_args()
return config

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-12-22 15:24:31
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from params import ALGO_NAME
def plot(item,ylabel='rewards_train', save_fig = True):
'''plot the given values using seaborn
'''
sns.set()
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of '+ALGO_NAME)
plt.ylabel(ylabel)
plt.xlabel('episodes')
if save_fig:
plt.savefig(os.path.dirname(__file__)+"/results/"+ylabel+".png")
plt.show()
# plt.show()
if __name__ == "__main__":
output_path = os.path.split(os.path.abspath(__file__))[0]+"/results/"
tag = 'train'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)
tag = 'eval'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards,ylabel='rewards_'+tag)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)

Binary file not shown

View File

@@ -0,0 +1,13 @@
# Hierarchical DQN
## Overview
Hierarchical DQN (h-DQN) is a hierarchical reinforcement learning method that, compared with DQN, adds a meta controller:
![image-20210331153115575](assets/image-20210331153115575.png)
That is, during training the meta controller produces a goal each time, and the controller (the lower-level actor) then acts until it reaches that goal or the episode is done. It is like giving the agent a squad leader who sets local subgoals to guide it forward, which helps on problems with long episodes or sparse rewards.
## Pseudocode
![image-20210331153542314](assets/image-20210331153542314.png)
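For reference, the per-episode training loop in `main.py` boils down to roughly the following structure. `run_episode` below is an illustrative wrapper written for this note, not a function in the repo; it assumes the agent interface defined in `agent.py` (`set_goal`, `to_onehot`, `choose_action`, `memory`, `meta_memory`, `update`).

```python
import numpy as np

def run_episode(env, agent):
    """One h-DQN episode: the meta controller proposes a goal, and the controller
    acts on the concatenated [state, one-hot goal] until the goal or the episode ends."""
    state = env.reset()
    done, ep_reward = False, 0
    while not done:
        goal = agent.set_goal(state)          # meta controller picks a goal index
        onehot_goal = agent.to_onehot(goal)
        meta_state, extrinsic_reward = state, 0
        while not done and goal != np.argmax(state):
            goal_state = np.concatenate([state, onehot_goal])
            action = agent.choose_action(goal_state)
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            extrinsic_reward += reward
            # the controller gets an intrinsic reward only when it reaches the goal
            intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
            agent.memory.push(goal_state, action, intrinsic_reward,
                              np.concatenate([next_state, onehot_goal]), done)
            state = next_state
            agent.update()
        # the meta controller learns from the extrinsic reward accumulated over the goal segment
        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
    return ep_reward
```

The controller is trained on the intrinsic (goal-reaching) reward, while the meta controller is trained on the accumulated extrinsic reward for the whole segment between goals.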

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18
LastEditor: John
LastEditTime: 2021-03-27 04:24:30
LastEditTime: 2021-03-31 14:51:09
Discription:
Environment:
'''
@@ -13,90 +13,103 @@ import torch
import torch.nn as nn
import numpy as np
import random,math
from HierarchicalDQN.model import MLP
from common.memory import ReplayBuffer
import torch.optim as optim
from common.model import MLP
from common.memory import ReplayBuffer
class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg):
self.state_dim = state_dim
self.action_dim = action_dim
self.gamma = cfg.gamma
self.device = cfg.device
self.batch_size = cfg.batch_size
self.sample_count = 0
self.epsilon = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.frame_idx = 0
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
self.meta_memory = ReplayBuffer(cfg.memory_capacity)
def to_onehot(x):
oh = np.zeros(6)
self.loss_numpy = 0
self.meta_loss_numpy = 0
self.losses = []
self.meta_losses = []
def to_onehot(self,x):
oh = np.zeros(self.state_dim)
oh[x - 1] = 1.
return oh
def set_goal(self,meta_state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
self.sample_count += 1
if random.random() > self.epsilon:
def set_goal(self,state):
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32)
q_value = self.policy_net(meta_state)
goal = q_value.max(1)[1].item()
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item()
else:
goal = random.randrange(self.action_dim)
goal = self.meta_policy_net(meta_state)
onehot_goal = self.to_onehot(goal)
return onehot_goal
goal = random.randrange(self.state_dim)
return goal
def choose_action(self,state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
self.sample_count += 1
if random.random() > self.epsilon:
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
return action
def update(self):
self.update_policy()
self.update_meta()
def update_policy(self):
if self.batch_size > len(self.memory):
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0])
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,dtype=torch.float)
action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch))
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values)
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters():
for param in self.policy_net.parameters(): # clip to prevent exploding gradients
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
self.optimizer.step()
self.loss_numpy = loss.detach().numpy()
self.losses.append(self.loss_numpy)
def update_meta(self):
if self.batch_size > len(self.meta_memory):
meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size)
meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float)
meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1)
meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float)
next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float)
meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1)
meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch)
next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach()
expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0])
meta_loss = nn.MSELoss()(meta_q_values, expected_meta_q_values.unsqueeze(1))
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,dtype=torch.float)
action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch))
q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
meta_loss = nn.MSELoss()(q_values, expected_q_values)
self.meta_optimizer.zero_grad()
meta_loss.backward()
for param in self.meta_policy_net.parameters():
for param in self.meta_policy_net.parameters(): # clip to prevent exploding gradients
param.grad.data.clamp_(-1, 1)
self.meta_optimizer.step()
self.meta_loss_numpy = meta_loss.detach().numpy()
self.meta_losses.append(self.meta_loss_numpy)
def save(self, path):
torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
def load(self, path):
self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))

Binary files not shown

File diff suppressed because one or more lines are too long

View File

@@ -3,95 +3,108 @@
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:04
Date: 2021-03-29 10:37:32
LastEditor: John
LastEditTime: 2021-03-27 04:23:43
LastEditTime: 2021-03-31 14:58:49
Discription:
Environment:
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path
import gym
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import datetime
import numpy as np
import torch
import datetime
from HierarchicalDQN.agent import HierarchicalDQN
from common.plot import plot_rewards
from common.utils import save_results
import gym
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
from common.utils import save_results
from common.plot import plot_rewards,plot_losses
from HierarchicalDQN.agent import HierarchicalDQN
SEQUENCE = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(curr_path+"/results/"):
os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)
class HierarchicalDQNConfig:
def __init__(self):
self.algo = "DQN" # name of algo
self.algo = "H-DQN" # name of algo
self.gamma = 0.99
self.epsilon_start = 0.95 # start epsilon of e-greedy policy
self.epsilon_start = 1 # start epsilon of e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.01 # learning rate
self.memory_capacity = 800 # Replay Memory capacity
self.batch_size = 64
self.train_eps = 250 # number of training episodes
self.train_steps = 200 # max steps per training episode
self.target_update = 2 # update frequency of the target net
self.eval_eps = 20 # number of evaluation episodes
self.eval_steps = 200 # max steps per evaluation episode
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
self.hidden_dim = 256 # dimension of hidden layer
self.lr = 0.0001 # learning rate
self.memory_capacity = 10000 # Replay Memory capacity
self.batch_size = 32
self.train_eps = 300 # number of training episodes
self.target_update = 2 # update frequency of the target net
self.eval_eps = 20 # number of evaluation episodes
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
self.hidden_dim = 256 # dimension of hidden layer
def train(cfg,env,agent):
def train(cfg, env, agent):
print('Start to train !')
rewards = []
ma_rewards = [] # moving average reward
ep_steps = []
ma_rewards = [] # moving average reward
for i_episode in range(cfg.train_eps):
state = env.reset()
extrinsic_reward = 0
for i_step in range(cfg.train_steps):
goal= agent.set_goal(state)
state = env.reset()
done = False
ep_reward = 0
while not done:
goal = agent.set_goal(state)
onehot_goal = agent.to_onehot(goal)
meta_state = state
goal_state = np.concatenate([state, goal])
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
extrinsic_reward += reward
intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done)
state = next_state
agent.update()
if done:
break
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done))
ep_steps.append(i_step)
rewards.append(extrinsic_reward)
extrinsic_reward = 0
while not done and goal != np.argmax(state):
goal_state = np.concatenate([state, onehot_goal])
action = agent.choose_action(goal_state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
extrinsic_reward += reward
intrinsic_reward = 1.0 if goal == np.argmax(
next_state) else 0.0
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
[next_state, onehot_goal]), done)
state = next_state
agent.update()
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*extrinsic_reward)
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(extrinsic_reward)
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
ma_rewards.append(ep_reward)
print('Complete training')
return rewards,ma_rewards
return rewards, ma_rewards
if __name__ == "__main__":
cfg = HierarchicalDQNConfig()
env = gym.make('CartPole-v0')
env.seed(1)
env.seed(1)
cfg = HierarchicalDQNConfig()
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent = HierarchicalDQN(state_dim, action_dim, cfg)
rewards, ma_rewards = train(cfg, env, agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)
save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=RESULT_PATH)
plot_losses(agent.losses,algo=cfg.algo, path=RESULT_PATH)

View File

@@ -1,24 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:17:09
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, action_dim)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

Binary files not shown

View File

@@ -19,9 +19,10 @@
## Runtime environment
python 3.7, pytorch 1.6.0-1.7.1, gym 0.17.0-0.18.0
## Usage
Run ```main.py``` in the corresponding algorithm folder
Run ```main.py``` or ```main.ipynb```
## Algorithm progress
| Algorithm | Reference papers | Environment | Notes |
@@ -29,17 +30,17 @@ python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0
| [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | Uses a CNN instead of a fully connected network, unlike plain DQN |
| [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | Uses a CNN instead of a fully connected network, unlike plain DQN |
| [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | Results are poor; needs improvement |
| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
| Hierarchical DQN | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | | |
| [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | |
| A2C | | [CartPole-v0](./envs/gym_info.md) | |
| A3C | | | |
| SAC | | | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
| TD3 | [TD3 Paper](https://arxiv.org/abs/1802.09477) | | |
| GAIL | | | |

View File

@@ -24,7 +24,7 @@ Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in diff
python 3.7.9、pytorch 1.6.0、gym 0.18.0
## Usage
For environment information see [环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)
Run ```main.py``` or ```main.ipynb```
## Schedule

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 21:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:15:00
LastEditTime: 2021-03-31 13:49:06
Discription:
Environment:
'''
@@ -15,15 +15,15 @@ import torch.nn.functional as F
from torch.distributions import Categorical
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
def __init__(self, input_dim,output_dim,hidden_dim=128):
""" 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
input_dim: 输入的feature即环境的state数目
output_dim: 输出的action总个数
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层
def forward(self, x):
# 各层对应的激活函数
@@ -32,10 +32,10 @@ class MLP(nn.Module):
return self.fc3(x)
class Critic(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
self.linear1 = nn.Linear(n_obs + output_dim, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1)
# randomly initialize with small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
return x
class Actor(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, action_dim)
self.linear3 = nn.Linear(hidden_size, output_dim)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
return x
class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim=256):
def __init__(self, input_dim, output_dim, hidden_dim=256):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)

View File

@@ -5,13 +5,13 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-07 20:57:11
LastEditor: John
LastEditTime: 2021-03-13 11:31:49
LastEditTime: 2021-03-31 14:05:52
Discription:
Environment:
'''
import matplotlib.pyplot as plt
import seaborn as sns
def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path='./'):
def plot_rewards(rewards,ma_rewards,tag="train",algo = "DQN",path='./'):
sns.set()
plt.title("average learning curve of {}".format(algo))
plt.xlabel('episodes')
@@ -20,4 +20,13 @@ def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC
plt.legend()
plt.savefig(path+"rewards_curve_{}".format(tag))
plt.show()
def plot_losses(losses,algo = "DQN",path='./'):
sns.set()
plt.title("loss curve of {}".format(algo))
plt.xlabel('episodes')
plt.plot(losses,label='losses')
plt.legend()
plt.savefig(path+"losses_curve")
plt.show()