update
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-24 22:18:18
 LastEditor: John
-LastEditTime: 2021-03-31 14:51:09
+LastEditTime: 2021-05-04 22:39:34
 Discription:
 Environment:
 '''
@@ -65,11 +65,11 @@ class HierarchicalDQN:
         if self.batch_size > len(self.memory):
             return
         state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
-        state_batch = torch.tensor(state_batch,dtype=torch.float)
-        action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
-        reward_batch = torch.tensor(reward_batch,dtype=torch.float)
-        next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
-        done_batch = torch.tensor(np.float32(done_batch))
+        state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)
+        next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch),device=self.device)
         q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
         next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
         expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
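
The change above builds every sampled batch directly on self.device, so the forward passes through policy_net run on the GPU when one is available. A minimal standalone sketch of the same conversion step, assuming the tuple order matches the sample() call in the hunk above (the helper name batch_to_device is illustrative, not part of the repo):

import numpy as np
import torch

def batch_to_device(batch, device):
    # Convert one sampled replay batch (lists/arrays) into tensors on the target device.
    # Assumed tuple order: (states, actions, rewards, next_states, dones).
    states, actions, rewards, next_states, dones = batch
    states = torch.tensor(np.array(states), dtype=torch.float, device=device)
    # actions become a column vector so gather(dim=1, index=actions) can pick Q(s, a)
    actions = torch.tensor(actions, dtype=torch.int64, device=device).unsqueeze(1)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)
    next_states = torch.tensor(np.array(next_states), dtype=torch.float, device=device)
    dones = torch.tensor(np.float32(dones), device=device)  # 1.0 where the episode ended
    return states, actions, rewards, next_states, dones

On a CPU-only machine device is simply torch.device('cpu'), so the same code runs unchanged.
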
@@ -79,17 +79,17 @@ class HierarchicalDQN:
         for param in self.policy_net.parameters():  # clip to prevent exploding gradients
             param.grad.data.clamp_(-1, 1)
         self.optimizer.step()
-        self.loss_numpy = loss.detach().numpy()
+        self.loss_numpy = loss.detach().cpu().numpy()
         self.losses.append(self.loss_numpy)
     def update_meta(self):
         if self.batch_size > len(self.meta_memory):
             return
         state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
-        state_batch = torch.tensor(state_batch,dtype=torch.float)
-        action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
-        reward_batch = torch.tensor(reward_batch,dtype=torch.float)
-        next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
-        done_batch = torch.tensor(np.float32(done_batch))
+        state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)
+        next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch),device=self.device)
         q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
         next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
         expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
@@ -99,7 +99,7 @@ class HierarchicalDQN:
         for param in self.meta_policy_net.parameters():  # clip to prevent exploding gradients
             param.grad.data.clamp_(-1, 1)
         self.meta_optimizer.step()
-        self.meta_loss_numpy = meta_loss.detach().numpy()
+        self.meta_loss_numpy = meta_loss.detach().cpu().numpy()
         self.meta_losses.append(self.meta_loss_numpy)

     def save(self, path):

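Both loss-logging lines now insert .cpu() before .numpy(): calling .numpy() on a CUDA tensor raises a TypeError, while .cpu() returns the tensor unchanged when it already lives on the CPU. A small illustrative snippet of the pattern (variable names are made up for the example):

import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pred = torch.randn(4, device=device, requires_grad=True)
target = torch.randn(4, device=device)
loss = F.mse_loss(pred, target)

# detach() drops the autograd graph, cpu() moves the value off the GPU (a no-op on CPU),
# and only then does numpy() yield a plain NumPy value suitable for logging or plotting.
loss_numpy = loss.detach().cpu().numpy()
print(float(loss_numpy))
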
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-29 10:37:32
 LastEditor: John
-LastEditTime: 2021-03-31 14:58:49
+LastEditTime: 2021-05-04 22:35:56
 Discription:
 Environment:
 '''
@@ -21,27 +21,23 @@ import numpy as np
 import torch
 import gym

-from common.utils import save_results
-from common.plot import plot_rewards,plot_losses
+from common.utils import save_results,make_dir
+from common.plot import plot_rewards
 from HierarchicalDQN.agent import HierarchicalDQN

-SEQUENCE = datetime.datetime.now().strftime(
+curr_time = datetime.datetime.now().strftime(
     "%Y%m%d-%H%M%S")  # obtain current time
-SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/'  # path to save model
-if not os.path.exists(curr_path+"/saved_model/"):
-    os.mkdir(curr_path+"/saved_model/")
-if not os.path.exists(SAVED_MODEL_PATH):
-    os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/'  # path to save rewards
-if not os.path.exists(curr_path+"/results/"):
-    os.mkdir(curr_path+"/results/")
-if not os.path.exists(RESULT_PATH):
-    os.mkdir(RESULT_PATH)

 class HierarchicalDQNConfig:
     def __init__(self):
         self.algo = "H-DQN"  # name of algo
         self.env = 'CartPole-v0'
+        self.result_path = curr_path+"/outputs/" + self.env + \
+            '/'+curr_time+'/results/'  # path to save results
+        self.model_path = curr_path+"/outputs/" + self.env + \
+            '/'+curr_time+'/models/'  # path to save models
+        self.train_eps = 300  # number of training episodes
+        self.eval_eps = 50  # number of evaluation episodes
         self.gamma = 0.99
         self.epsilon_start = 1  # start epsilon of e-greedy policy
         self.epsilon_end = 0.01
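
The hand-rolled os.path.exists/os.mkdir chains are gone; output directories are now created through make_dir from common.utils and grouped under outputs/<env>/<timestamp>/. The diff does not show make_dir itself, so the following is only a plausible sketch of such a helper, not the repo's actual implementation:

import os

def make_dir(*paths):
    # Hypothetical stand-in for common.utils.make_dir.
    # Create each directory, including missing parents; skip any that already exist.
    for path in paths:
        os.makedirs(path, exist_ok=True)

It must accept several paths at once, since __main__ later calls make_dir(cfg.result_path, cfg.model_path).
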
@@ -49,19 +45,25 @@ class HierarchicalDQNConfig:
         self.lr = 0.0001  # learning rate
         self.memory_capacity = 10000  # Replay Memory capacity
         self.batch_size = 32
-        self.train_eps = 300  # number of training episodes
         self.target_update = 2  # update frequency of the target net
-        self.eval_eps = 20  # number of evaluation episodes
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
         self.hidden_dim = 256  # dimension of hidden layer

+def env_agent_config(cfg,seed=1):
+    env = gym.make(cfg.env)
+    env.seed(seed)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = HierarchicalDQN(state_dim,action_dim,cfg)
+    return env,agent
+
 def train(cfg, env, agent):
     print('Start to train !')
+    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
     rewards = []
     ma_rewards = []  # moving average reward
-    for i_episode in range(cfg.train_eps):
+    for i_ep in range(cfg.train_eps):
         state = env.reset()
         done = False
         ep_reward = 0
@@ -83,7 +85,7 @@ def train(cfg, env, agent):
                 state = next_state
                 agent.update()
             agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
-        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
+        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(
@@ -93,18 +95,52 @@ def train(cfg, env, agent):
     print('Complete training!')
     return rewards, ma_rewards

+def eval(cfg, env, agent):
+    print('Start to eval !')
+    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
+    rewards = []
+    ma_rewards = []  # moving average reward
+    for i_ep in range(cfg.eval_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            goal = agent.set_goal(state)
+            onehot_goal = agent.to_onehot(goal)
+            extrinsic_reward = 0
+            while not done and goal != np.argmax(state):
+                goal_state = np.concatenate([state, onehot_goal])
+                action = agent.choose_action(goal_state)
+                next_state, reward, done, _ = env.step(action)
+                ep_reward += reward
+                extrinsic_reward += reward
+                state = next_state
+                agent.update()
+        print(f'Episode:{i_ep+1}/{cfg.eval_eps}, Reward:{ep_reward}, Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}')
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('Complete evaluating!')
+    return rewards, ma_rewards

 if __name__ == "__main__":
-    env = gym.make('CartPole-v0')
-    env.seed(1)
     cfg = HierarchicalDQNConfig()
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = HierarchicalDQN(state_dim, action_dim, cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
-    agent.save(path=SAVED_MODEL_PATH)
-    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
-    plot_rewards(rewards, ma_rewards, tag="train",
-                 algo=cfg.algo, path=RESULT_PATH)
-    plot_losses(agent.losses,algo=cfg.algo, path=RESULT_PATH)
+    # train
+    env,agent = env_agent_config(cfg,seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, tag="train",
+                 algo=cfg.algo, path=cfg.result_path)
+    # eval
+    env,agent = env_agent_config(cfg,seed=10)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = eval(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
+    plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
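
Both train and the new eval smooth the per-episode return with the same exponential moving average, ma = 0.9*ma + 0.1*ep_reward, seeded with the first episode's raw reward. A tiny standalone sketch of that bookkeeping (the helper name is illustrative):

def update_ma_rewards(ma_rewards, ep_reward, alpha=0.1):
    # Append the exponentially smoothed reward; the first entry is the raw reward itself.
    if ma_rewards:
        ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
    else:
        ma_rewards.append(ep_reward)

ma = []
for ep_reward in [10.0, 12.0, 30.0]:
    update_ma_rewards(ma, ep_reward)
print(ma)  # approximately [10.0, 10.2, 12.18]
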