This commit is contained in:
JohnJim0816
2021-03-31 15:37:09 +08:00
parent 6a92f97138
commit b6f63a91bf
65 changed files with 1244 additions and 459 deletions

View File

@@ -0,0 +1,13 @@
# Hierarchical DQN
## 原理简介
Hierarchical DQN是一种分层强化学习方法与DQN相比增加了一个meta controller
![image-20210331153115575](assets/image-20210331153115575.png)
即学习时meta controller每次会生成一个goal然后controller或者说下面的actor就会达到这个goal直到done为止。这就相当于给agent增加了一个队长队长擅长制定局部目标指导agent前行这样应对一些每回合步数较长或者稀疏奖励的问题会有所帮助。
## 伪代码
![image-20210331153542314](assets/image-20210331153542314.png)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:18:18
LastEditor: John
LastEditTime: 2021-03-27 04:24:30
LastEditTime: 2021-03-31 14:51:09
Discription:
Environment:
'''
@@ -13,90 +13,103 @@ import torch
import torch.nn as nn
import numpy as np
import random,math
from HierarchicalDQN.model import MLP
from common.memory import ReplayBuffer
import torch.optim as optim
from common.model import MLP
from common.memory import ReplayBuffer
class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg):
self.state_dim = state_dim
self.action_dim = action_dim
self.gamma = cfg.gamma
self.device = cfg.device
self.batch_size = cfg.batch_size
self.sample_count = 0
self.epsilon = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.frame_idx = 0
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
self.meta_memory = ReplayBuffer(cfg.memory_capacity)
def to_onehot(x):
oh = np.zeros(6)
self.loss_numpy = 0
self.meta_loss_numpy = 0
self.losses = []
self.meta_losses = []
def to_onehot(self,x):
oh = np.zeros(self.state_dim)
oh[x - 1] = 1.
return oh
def set_goal(self,meta_state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
self.sample_count += 1
if random.random() > self.epsilon:
def set_goal(self,state):
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32)
q_value = self.policy_net(meta_state)
goal = q_value.max(1)[1].item()
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item()
else:
goal = random.randrange(self.action_dim)
goal = self.meta_policy_net(meta_state)
onehot_goal = self.to_onehot(goal)
return onehot_goal
goal = random.randrange(self.state_dim)
return goal
def choose_action(self,state):
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
self.sample_count += 1
if random.random() > self.epsilon:
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
return action
def update(self):
self.update_policy()
self.update_meta()
def update_policy(self):
if self.batch_size > len(self.memory):
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0])
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,dtype=torch.float)
action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch))
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values)
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters():
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
self.optimizer.step()
self.loss_numpy = loss.detach().numpy()
self.losses.append(self.loss_numpy)
def update_meta(self):
if self.batch_size > len(self.meta_memory):
meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size)
meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float)
meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1)
meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float)
next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float)
meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1)
meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch)
next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach()
expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0])
meta_loss = nn.MSEmeta_loss()(meta_q_values, expected_meta_q_values.unsqueeze(1))
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,dtype=torch.float)
action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
reward_batch = torch.tensor(reward_batch,dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch))
q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
meta_loss = nn.MSELoss()(q_values, expected_q_values)
self.meta_optimizer.zero_grad()
meta_loss.backward()
for param in self.meta_policy_net.parameters():
for param in self.meta_policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.meta_optimizer.step()
self.meta_loss_numpy = meta_loss.detach().numpy()
self.meta_losses.append(self.meta_loss_numpy)
def save(self, path):
torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
def load(self, path):
self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 311 KiB

File diff suppressed because one or more lines are too long

View File

@@ -3,95 +3,108 @@
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:04
Date: 2021-03-29 10:37:32
LastEditor: John
LastEditTime: 2021-03-27 04:23:43
LastEditTime: 2021-03-31 14:58:49
Discription:
Environment:
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path to sys.path
import gym
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import datetime
import numpy as np
import torch
import datetime
from HierarchicalDQN.agent import HierarchicalDQN
from common.plot import plot_rewards
from common.utils import save_results
import gym
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
from common.utils import save_results
from common.plot import plot_rewards,plot_losses
from HierarchicalDQN.agent import HierarchicalDQN
SEQUENCE = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
if not os.path.exists(curr_path+"/saved_model/"):
os.mkdir(curr_path+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(curr_path+"/results/"):
os.mkdir(curr_path+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)
class HierarchicalDQNConfig:
def __init__(self):
self.algo = "DQN" # name of algo
self.algo = "H-DQN" # name of algo
self.gamma = 0.99
self.epsilon_start = 0.95 # start epsilon of e-greedy policy
self.epsilon_start = 1 # start epsilon of e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.01 # learning rate
self.memory_capacity = 800 # Replay Memory capacity
self.batch_size = 64
self.train_eps = 250 # 训练的episode数目
self.train_steps = 200 # 训练每个episode的最大长度
self.target_update = 2 # target net的更新频率
self.eval_eps = 20 # 测试的episode数目
self.eval_steps = 200 # 测试每个episode的最大长度
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.hidden_dim = 256 # dimension of hidden layer
self.lr = 0.0001 # learning rate
self.memory_capacity = 10000 # Replay Memory capacity
self.batch_size = 32
self.train_eps = 300 # 训练的episode数目
self.target_update = 2 # target net的更新频率
self.eval_eps = 20 # 测试的episode数目
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
self.hidden_dim = 256 # dimension of hidden layer
def train(cfg,env,agent):
def train(cfg, env, agent):
print('Start to train !')
rewards = []
ma_rewards = [] # moving average reward
ep_steps = []
ma_rewards = [] # moveing average reward
for i_episode in range(cfg.train_eps):
state = env.reset()
extrinsic_reward = 0
for i_step in range(cfg.train_steps):
goal= agent.set_goal(state)
state = env.reset()
done = False
ep_reward = 0
while not done:
goal = agent.set_goal(state)
onehot_goal = agent.to_onehot(goal)
meta_state = state
goal_state = np.concatenate([state, goal])
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
extrinsic_reward += reward
intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done)
state = next_state
agent.update()
if done:
break
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done))
ep_steps.append(i_step)
rewards.append(extrinsic_reward)
extrinsic_reward = 0
while not done and goal != np.argmax(state):
goal_state = np.concatenate([state, onehot_goal])
action = agent.choose_action(goal_state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
extrinsic_reward += reward
intrinsic_reward = 1.0 if goal == np.argmax(
next_state) else 0.0
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
[next_state, onehot_goal]), done)
state = next_state
agent.update()
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*extrinsic_reward)
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(extrinsic_reward)
agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
ma_rewards.append(ep_reward)
print('Complete training')
return rewards,ma_rewards
return rewards, ma_rewards
if __name__ == "__main__":
cfg = HierarchicalDQNConfig()
env = gym.make('CartPole-v0')
env.seed(1)
env.seed(1)
cfg = HierarchicalDQNConfig()
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent = HierarchicalDQN(state_dim, action_dim, cfg)
rewards, ma_rewards = train(cfg, env, agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)
save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=RESULT_PATH)
plot_losses(agent.losses,algo=cfg.algo, path=RESULT_PATH)

View File

@@ -1,24 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-24 22:14:12
LastEditor: John
LastEditTime: 2021-03-24 22:17:09
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, action_dim)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB