hot update Double DQN
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
-LastEditTime: 2022-08-23 23:59:54
+LastEditTime: 2022-08-29 23:30:08
@Discription:
@Environment: python 3.7.7
'''
@@ -78,7 +78,7 @@ class DQN:
            self.batch_size)
        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
-       reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
+       reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
        # print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
@@ -91,7 +91,7 @@ class DQN:
        # compute the expected q value; for a terminal state, done_batch[0]=1 and expected_q_value = reward correspondingly
        expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch * (1-done_batch)
        # print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
        loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # both tensors have shape (batchsize,1)
        # backpropagation
        self.optimizer.zero_grad()
        loss.backward()
||||
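For reference, the hunk above implements the standard one-step Q-learning target y = r + gamma * max_a Q'(s', a) * (1 - done), so terminal transitions keep only the reward. A minimal, self-contained sketch of the same computation on dummy tensors (names mirror the hunk; the random values are purely illustrative and not part of the commit):

import torch
import torch.nn as nn

batch_size, gamma = 4, 0.95
q_value_batch = torch.rand(batch_size, 1, requires_grad=True)  # Q(s_t, a_t) gathered from the policy net
next_max_q_value_batch = torch.rand(batch_size, 1)             # max_a Q'(s_{t+1}, a) from the target net
reward_batch = torch.rand(batch_size, 1)                       # shape(batchsize,1)
done_batch = torch.zeros(batch_size, 1)                        # 1.0 marks a terminal transition

# y = r + gamma * max_a Q'(s', a) * (1 - done)
expected_q_value_batch = reward_batch + gamma * next_max_q_value_batch * (1 - done_batch)
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch)     # both sides are shape(batchsize,1)
loss.backward()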
@@ -9,130 +9,122 @@ import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
import argparse
|
||||
from common.utils import save_results,all_seed
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.utils import all_seed
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from common.launcher import Launcher
|
||||
from envs.register import register_env
|
||||
from dqn import DQN
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
""" hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
# please manually change the following args in this script if you want
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models' )
|
||||
args = parser.parse_args()
|
||||
args = {**vars(args)} # type(dict)
|
||||
return args
|
||||
|
||||
def get_args():
|
||||
""" hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
# please manually change the following args in this script if you want
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models' )
|
||||
args = parser.parse_args()
|
||||
args = {**vars(args)} # type(dict)
|
||||
return args
|
||||
def env_agent_config(cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update into cfg parameters
|
||||
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
||||
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
||||
agent = DQN(model,memory,cfg) # create agent
|
||||
return env, agent
|
||||
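The try/except above covers both kinds of observation spaces: Discrete spaces expose .n, while Box spaces expose .shape but no .n. A quick illustration (the exact env ids depend on the installed gym version; this sketch is not part of the commit):

import gym

env = gym.make('CartPole-v0')                  # Box observation space
print(hasattr(env.observation_space, 'n'))     # False -> fall back to shape[0]
print(env.observation_space.shape[0], env.action_space.n)  # 4 2

env = gym.make('FrozenLake-v0')                # Discrete observation space
print(env.observation_space.n)                 # 16 states on the default 4x4 map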
|
||||
def env_agent_config(cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
env = gym.make(cfg['env_name']) # create env
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
n_states = env.observation_space.shape[0] # state dimension
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update into cfg parameters
|
||||
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
||||
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
||||
agent = DQN(model,memory,cfg) # create agent
|
||||
return env, agent
|
||||
def train(cfg, env, agent):
|
||||
''' train
|
||||
'''
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step += 1
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.memory.push(state, action, reward,
|
||||
next_state, done) # save transitions
|
||||
state = next_state # update next state for env
|
||||
agent.update() # update agent
|
||||
ep_reward += reward #
|
||||
if done:
|
||||
break
|
||||
if (i_ep + 1) % cfg["target_update"] == 0: # target net update; target_update corresponds to "C" in the pseudocode
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' train
|
||||
'''
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step += 1
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.memory.push(state, action, reward,
|
||||
next_state, done) # save transitions
|
||||
state = next_state # update next state for env
|
||||
agent.update() # update agent
|
||||
ep_reward += reward #
|
||||
if done:
|
||||
break
|
||||
if (i_ep + 1) % cfg["target_update"] == 0: # target net update; target_update corresponds to "C" in the pseudocode
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step+=1
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
def test(cfg, env, agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step+=1
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
# training
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
save_args(cfg,path = cfg['result_path']) # save parameters
|
||||
agent.save_model(path = cfg['model_path']) # save models
|
||||
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
|
||||
# testing
|
||||
env, agent = env_agent_config(cfg) # create a new env for testing; this step can sometimes be skipped
|
||||
agent.load_model(path = cfg['model_path']) # load model
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path = cfg['result_path'])
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
@@ -1 +1,21 @@
|
||||
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true}
|
||||
{
|
||||
"algo_name": "DQN",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 200,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.95,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 500,
|
||||
"lr": 0.0001,
|
||||
"memory_capacity": 100000,
|
||||
"batch_size": 64,
|
||||
"target_update": 4,
|
||||
"hidden_dim": 256,
|
||||
"device": "cpu",
|
||||
"seed": 10,
|
||||
"result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results",
|
||||
"model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models",
|
||||
"show_fig": false,
|
||||
"save_fig": true
|
||||
}
|
||||
@@ -1 +1,24 @@
|
||||
{"algo_name": "DQN", "env_name": "CartPole-v1", "train_eps": 2000, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 6000, "lr": 1e-05, "memory_capacity": 200000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", "n_states": 4, "n_actions": 2}
|
||||
{
|
||||
"algo_name": "DQN",
|
||||
"env_name": "CartPole-v1",
|
||||
"train_eps": 2000,
|
||||
"test_eps": 20,
|
||||
"ep_max_steps": 100000,
|
||||
"gamma": 0.99,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 6000,
|
||||
"lr": 1e-05,
|
||||
"memory_capacity": 200000,
|
||||
"batch_size": 64,
|
||||
"target_update": 4,
|
||||
"hidden_dim": 256,
|
||||
"device": "cuda",
|
||||
"seed": 10,
|
||||
"show_fig": false,
|
||||
"save_fig": true,
|
||||
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results",
|
||||
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models",
|
||||
"n_states": 4,
|
||||
"n_actions": 2
|
||||
}
|
||||
@@ -5,7 +5,7 @@
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:50:49
|
||||
@LastEditor: John
|
||||
LastEditTime: 2022-07-21 00:08:26
|
||||
LastEditTime: 2022-08-29 23:34:20
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
@@ -20,148 +20,87 @@ import torch.nn.functional as F
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity # capacity of the replay buffer
        self.buffer = [] # buffer
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        ''' the buffer is a queue: once capacity is exceeded, the oldest stored transition is dropped
        '''
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
        state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
        return state, action, reward, next_state, done

    def __len__(self):
        ''' return the current number of stored transitions
        '''
        return len(self.buffer)
|
||||
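A toy illustration of the ring-buffer behaviour of this class (hypothetical values, not part of the commit): once capacity is reached, position wraps around and the oldest transition is overwritten.

buffer = ReplayBuffer(capacity=2)
buffer.push(0, 0, 1.0, 1, False)
buffer.push(1, 1, 0.0, 2, False)
buffer.push(2, 0, 1.0, 3, True)   # position wrapped to 0, so the first transition is overwritten
print(len(buffer))                # 2
states, actions, rewards, next_states, dones = buffer.sample(2)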
|
||||
class MLP(nn.Module):
    def __init__(self, n_states,n_actions,hidden_dim=128):
        """ initialize the Q network as a fully connected network
            n_states: number of input features, i.e. the state dimension of the environment
            n_actions: output dimension, i.e. the number of actions
        """
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions) # output layer

    def forward(self, x):
        # activation function for each layer
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
|
||||
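As a quick shape check (illustrative only, not part of the commit): for CartPole the network maps a 4-dimensional state to 2 action values.

import torch
net = MLP(n_states=4, n_actions=2, hidden_dim=128)
q = net(torch.rand(1, 4))   # a batch of one state
print(q.shape)              # torch.Size([1, 2])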
|
||||
class DoubleDQN:
|
||||
def __init__(self, n_states, n_actions, model, memory, cfg):
|
||||
        self.n_actions = n_actions # total number of actions
        self.device = torch.device(cfg.device) # device, cpu or gpu
        self.gamma = cfg.gamma
        # parameters of the e-greedy policy
|
||||
self.sample_count = 0
|
||||
self.epsilon_start = cfg.epsilon_start
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = model.to(self.device)
|
||||
self.target_net = model.to(self.device)
|
||||
def __init__(self,models, memories, cfg):
|
||||
self.n_actions = cfg['n_actions']
|
||||
self.device = torch.device(cfg['device'])
|
||||
self.gamma = cfg['gamma']
|
||||
## e-greedy parameters
|
||||
self.sample_count = 0 # sample count for epsilon decay
|
||||
self.epsilon_start = cfg['epsilon_start']
|
||||
self.epsilon_end = cfg['epsilon_end']
|
||||
self.epsilon_decay = cfg['epsilon_decay']
|
||||
self.batch_size = cfg['batch_size']
|
||||
self.policy_net = models['Qnet'].to(self.device)
|
||||
self.target_net = models['Qnet'].to(self.device)
|
||||
# target_net copy from policy_net
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
target_param.data.copy_(param.data)
|
||||
        # self.target_net.eval() # disable BatchNormalization and Dropout
        # see the difference between parameters() and state_dict(): the former has require_grad=True
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
|
||||
self.loss = 0
|
||||
self.memory = memory
|
||||
# self.target_net.eval() # do not use BatchNormalization or Dropout
|
||||
# the difference between parameters() and state_dict() is that parameters() require_grad=True
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr'])
|
||||
self.memory = memories['Memory']
|
||||
self.update_flag = False
|
||||
|
||||
    def sample(self, state):
        '''choose action
    def sample_action(self, state):
        ''' sample action
        '''
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
        if random.random() > self.epsilon:
            with torch.no_grad():
                # first convert to a tensor so it can be fed to the neural network; the state elements are originally float64
                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
                state = torch.tensor(
                    [state], device=self.device, dtype=torch.float32)
                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
                q_value = self.policy_net(state)
                # tensor.max(1) returns the per-row maximum together with its index,
                # e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])),
                # so tensor.max(1)[1] returns the index of the maximum value, i.e. the action
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action
|
||||
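The epsilon used above decays exponentially from epsilon_start towards epsilon_end as sample_count grows. A quick check of the values with the commit's defaults (epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=500); the snippet is illustrative, not part of the commit:

import math

epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500
for sample_count in (0, 500, 1000, 2000):
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * sample_count / epsilon_decay)
    print(sample_count, round(epsilon, 3))  # roughly 0.95, 0.356, 0.137, 0.027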
def predict(self, state):
|
||||
'''choose action
|
||||
def predict_action(self, state):
|
||||
''' predict action
|
||||
'''
|
||||
with torch.no_grad():
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
|
||||
q_value = self.policy_net(state)
|
||||
action = q_value.max(1)[1].item()
|
||||
return action
|
||||
def update(self):
|
||||
        if len(self.memory) < self.batch_size: # only update once the memory holds enough transitions
        if len(self.memory) < self.batch_size: # when the transitions in memory do not fill a batch, do not update
|
||||
return
|
||||
# randomly sample transitions from memory
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
else:
|
||||
if not self.update_flag:
|
||||
print("Begin to update!")
|
||||
self.update_flag = True
|
||||
# sample a batch of transitions from replay buffer
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
|
||||
# convert to tensor
|
||||
state_batch = torch.tensor(
|
||||
state_batch, device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
|
||||
1) # e.g. tensor([[1],...,[0]])
|
||||
reward_batch = torch.tensor(
|
||||
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
|
||||
next_state_batch = torch.tensor(
|
||||
next_state_batch, device=self.device, dtype=torch.float)
|
||||
|
||||
done_batch = torch.tensor(np.float32(
|
||||
            done_batch), device=self.device) # convert bool to float and then to a tensor
        # compute Q(s_t, a) for the current (s_t, a)
|
||||
q_values = self.policy_net(state_batch)
|
||||
next_q_values = self.policy_net(next_state_batch)
|
||||
# plug in the chosen action to get Q(s_t|a=a_t)
|
||||
q_value = q_values.gather(dim=1, index=action_batch)
|
||||
        '''the following is how Nature DQN computes q_target
        # compute the maximum of Q'(s_{t+1}) over next states, where Q' is the target network's Q function
|
||||
next_q_state_value = self.target_net(
|
||||
            next_state_batch).max(1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
        # compute q_target
        # for a terminal state, done_batch[0]=1 and the corresponding expected_q_value equals the reward
|
||||
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
|
||||
'''
|
||||
'''the following is how Double DQN computes q_target, slightly different from Nature DQN'''
|
||||
next_target_values = self.target_net(
|
||||
next_state_batch)
|
||||
# select the action that maximizes Q(s_t', a), then plug it into next_target_values to get the target net's next_q_value, i.e. Q'(s_t'|a=argmax Q(s_t', a))
|
||||
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
|
||||
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
|
||||
        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # compute the mean squared error loss
        # optimize the model
|
||||
        self.optimizer.zero_grad() # zero_grad clears all the old gradients from the last step
        # loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters that require gradients
|
||||
self.loss.backward()
|
||||
for param in self.policy_net.parameters(): # clip to avoid gradient explosion
|
||||
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
|
||||
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
# compute current Q(s_t|a=a_t)
|
||||
q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
|
||||
next_q_value_batch = self.policy_net(next_state_batch)
|
||||
'''the following is how Double DQN computes expected_q_value, a bit different from Nature DQN'''
|
||||
next_target_value_batch = self.target_net(next_state_batch)
|
||||
# choose the argmax action from Q(s_t', a), then gather it from next_target_values to obtain the target net's next_q_value, i.e. Q'(s_t'|a=argmax Q(s_t', a))
|
||||
next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1)
|
||||
expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch)
|
||||
loss = nn.MSELoss()(q_value_batch , expected_q_value_batch)
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
# clip to avoid gradient explosion
|
||||
for param in self.policy_net.parameters():
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step() # update the model
|
||||
self.optimizer.step()
|
||||
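The key difference implemented above: Nature DQN both selects and evaluates the next action with the target network, whereas Double DQN selects the action with the policy network and evaluates it with the target network. A minimal sketch on dummy Q-values (illustrative only, not part of the commit):

import torch

next_q_policy = torch.tensor([[0.1, 0.9], [0.5, 0.2], [0.3, 0.4]])  # Q(s', .) from policy_net
next_q_target = torch.tensor([[0.2, 0.7], [0.6, 0.1], [0.8, 0.3]])  # Q'(s', .) from target_net

# Nature DQN: selection and evaluation both use the target network
nature_next_q = next_q_target.max(1)[0]                           # tensor([0.7000, 0.6000, 0.8000])

# Double DQN: select with the policy network, evaluate with the target network
best_actions = next_q_policy.max(1)[1].unsqueeze(1)               # tensor([[1], [0], [1]])
double_next_q = next_q_target.gather(1, best_actions).squeeze(1)  # tensor([0.7000, 0.6000, 0.3000])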
|
||||
def save(self,path):
|
||||
def save_model(self,path):
|
||||
from pathlib import Path
|
||||
# create path
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
|
||||
|
||||
def load(self,path):
|
||||
def load_model(self,path):
|
||||
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
|
||||
projects/codes/DoubleDQN/main.py (new file, 129 lines)
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-11-07 18:10:37
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-08-29 23:33:31
|
||||
Discription:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import datetime
|
||||
import argparse
|
||||
|
||||
from common.utils import all_seed
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBufferQue
|
||||
from DoubleDQN.double_dqn import DoubleDQN
|
||||
from common.launcher import Launcher
|
||||
from envs.register import register_env
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
''' hyperparameters
|
||||
'''
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=1,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(self,cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update into cfg parameters
|
||||
models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])}
|
||||
memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])}
|
||||
agent = DoubleDQN(models,memories,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.sample_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
agent.memory.push((state, action, reward, next_state, done))
|
||||
state = next_state
|
||||
agent.update()
|
||||
if done:
|
||||
break
|
||||
if i_ep % cfg['target_update'] == 0:
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.predict_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main = Main()
|
||||
main.run()
|
||||
Binary file not shown.
@@ -1 +0,0 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2}
|
||||
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,145.0,0
|
||||
1,166.0,0
|
||||
2,171.0,0
|
||||
3,200.0,0
|
||||
4,139.0,0
|
||||
5,200.0,0
|
||||
6,200.0,0
|
||||
7,141.0,0
|
||||
8,200.0,0
|
||||
9,187.0,0
|
||||
10,166.0,0
|
||||
11,172.0,0
|
||||
12,121.0,0
|
||||
13,200.0,0
|
||||
14,200.0,0
|
||||
15,149.0,0
|
||||
16,128.0,0
|
||||
17,200.0,0
|
||||
18,178.0,0
|
||||
19,185.0,0
|
||||
|
Binary file not shown.
@@ -0,0 +1,201 @@
|
||||
episodes,rewards,steps
|
||||
0,19.0,0
|
||||
1,16.0,0
|
||||
2,17.0,0
|
||||
3,11.0,0
|
||||
4,10.0,0
|
||||
5,27.0,0
|
||||
6,16.0,0
|
||||
7,9.0,0
|
||||
8,20.0,0
|
||||
9,21.0,0
|
||||
10,15.0,0
|
||||
11,10.0,0
|
||||
12,14.0,0
|
||||
13,37.0,0
|
||||
14,12.0,0
|
||||
15,10.0,0
|
||||
16,27.0,0
|
||||
17,33.0,0
|
||||
18,19.0,0
|
||||
19,13.0,0
|
||||
20,26.0,0
|
||||
21,15.0,0
|
||||
22,29.0,0
|
||||
23,11.0,0
|
||||
24,20.0,0
|
||||
25,23.0,0
|
||||
26,23.0,0
|
||||
27,26.0,0
|
||||
28,17.0,0
|
||||
29,33.0,0
|
||||
30,16.0,0
|
||||
31,48.0,0
|
||||
32,48.0,0
|
||||
33,69.0,0
|
||||
34,58.0,0
|
||||
35,24.0,0
|
||||
36,18.0,0
|
||||
37,28.0,0
|
||||
38,12.0,0
|
||||
39,12.0,0
|
||||
40,18.0,0
|
||||
41,12.0,0
|
||||
42,13.0,0
|
||||
43,21.0,0
|
||||
44,30.0,0
|
||||
45,32.0,0
|
||||
46,22.0,0
|
||||
47,18.0,0
|
||||
48,12.0,0
|
||||
49,12.0,0
|
||||
50,20.0,0
|
||||
51,32.0,0
|
||||
52,15.0,0
|
||||
53,100.0,0
|
||||
54,26.0,0
|
||||
55,25.0,0
|
||||
56,18.0,0
|
||||
57,15.0,0
|
||||
58,35.0,0
|
||||
59,12.0,0
|
||||
60,65.0,0
|
||||
61,27.0,0
|
||||
62,29.0,0
|
||||
63,22.0,0
|
||||
64,83.0,0
|
||||
65,24.0,0
|
||||
66,28.0,0
|
||||
67,15.0,0
|
||||
68,43.0,0
|
||||
69,13.0,0
|
||||
70,22.0,0
|
||||
71,46.0,0
|
||||
72,14.0,0
|
||||
73,32.0,0
|
||||
74,44.0,0
|
||||
75,53.0,0
|
||||
76,31.0,0
|
||||
77,51.0,0
|
||||
78,61.0,0
|
||||
79,30.0,0
|
||||
80,36.0,0
|
||||
81,30.0,0
|
||||
82,48.0,0
|
||||
83,26.0,0
|
||||
84,27.0,0
|
||||
85,43.0,0
|
||||
86,20.0,0
|
||||
87,87.0,0
|
||||
88,71.0,0
|
||||
89,43.0,0
|
||||
90,57.0,0
|
||||
91,40.0,0
|
||||
92,37.0,0
|
||||
93,43.0,0
|
||||
94,31.0,0
|
||||
95,45.0,0
|
||||
96,47.0,0
|
||||
97,52.0,0
|
||||
98,48.0,0
|
||||
99,98.0,0
|
||||
100,49.0,0
|
||||
101,98.0,0
|
||||
102,68.0,0
|
||||
103,70.0,0
|
||||
104,74.0,0
|
||||
105,73.0,0
|
||||
106,127.0,0
|
||||
107,92.0,0
|
||||
108,70.0,0
|
||||
109,97.0,0
|
||||
110,66.0,0
|
||||
111,112.0,0
|
||||
112,138.0,0
|
||||
113,81.0,0
|
||||
114,74.0,0
|
||||
115,153.0,0
|
||||
116,113.0,0
|
||||
117,88.0,0
|
||||
118,138.0,0
|
||||
119,200.0,0
|
||||
120,84.0,0
|
||||
121,123.0,0
|
||||
122,158.0,0
|
||||
123,171.0,0
|
||||
124,137.0,0
|
||||
125,143.0,0
|
||||
126,170.0,0
|
||||
127,127.0,0
|
||||
128,118.0,0
|
||||
129,200.0,0
|
||||
130,189.0,0
|
||||
131,149.0,0
|
||||
132,137.0,0
|
||||
133,115.0,0
|
||||
134,153.0,0
|
||||
135,136.0,0
|
||||
136,140.0,0
|
||||
137,169.0,0
|
||||
138,187.0,0
|
||||
139,200.0,0
|
||||
140,196.0,0
|
||||
141,200.0,0
|
||||
142,200.0,0
|
||||
143,137.0,0
|
||||
144,200.0,0
|
||||
145,185.0,0
|
||||
146,200.0,0
|
||||
147,164.0,0
|
||||
148,200.0,0
|
||||
149,143.0,0
|
||||
150,143.0,0
|
||||
151,112.0,0
|
||||
152,192.0,0
|
||||
153,200.0,0
|
||||
154,144.0,0
|
||||
155,188.0,0
|
||||
156,200.0,0
|
||||
157,133.0,0
|
||||
158,200.0,0
|
||||
159,143.0,0
|
||||
160,158.0,0
|
||||
161,161.0,0
|
||||
162,169.0,0
|
||||
163,176.0,0
|
||||
164,200.0,0
|
||||
165,149.0,0
|
||||
166,156.0,0
|
||||
167,200.0,0
|
||||
168,200.0,0
|
||||
169,200.0,0
|
||||
170,134.0,0
|
||||
171,171.0,0
|
||||
172,200.0,0
|
||||
173,200.0,0
|
||||
174,200.0,0
|
||||
175,194.0,0
|
||||
176,200.0,0
|
||||
177,138.0,0
|
||||
178,159.0,0
|
||||
179,187.0,0
|
||||
180,200.0,0
|
||||
181,192.0,0
|
||||
182,200.0,0
|
||||
183,200.0,0
|
||||
184,200.0,0
|
||||
185,173.0,0
|
||||
186,200.0,0
|
||||
187,178.0,0
|
||||
188,176.0,0
|
||||
189,196.0,0
|
||||
190,200.0,0
|
||||
191,195.0,0
|
||||
192,158.0,0
|
||||
193,156.0,0
|
||||
194,200.0,0
|
||||
195,200.0,0
|
||||
196,200.0,0
|
||||
197,200.0,0
|
||||
198,193.0,0
|
||||
199,200.0,0
|
||||
|
Binary file not shown.
@@ -0,0 +1 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2}
|
||||
Binary file not shown.
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,200.0,0
|
||||
1,200.0,0
|
||||
2,200.0,0
|
||||
3,200.0,0
|
||||
4,191.0,0
|
||||
5,200.0,0
|
||||
6,200.0,0
|
||||
7,179.0,0
|
||||
8,200.0,0
|
||||
9,200.0,0
|
||||
10,200.0,0
|
||||
11,190.0,0
|
||||
12,147.0,0
|
||||
13,197.0,0
|
||||
14,200.0,0
|
||||
15,200.0,0
|
||||
16,167.0,0
|
||||
17,200.0,0
|
||||
18,200.0,0
|
||||
19,200.0,0
|
||||
|
Binary file not shown.
@@ -0,0 +1,201 @@
|
||||
episodes,rewards,steps
|
||||
0,19.0,0
|
||||
1,16.0,0
|
||||
2,17.0,0
|
||||
3,11.0,0
|
||||
4,10.0,0
|
||||
5,27.0,0
|
||||
6,55.0,0
|
||||
7,17.0,0
|
||||
8,23.0,0
|
||||
9,9.0,0
|
||||
10,17.0,0
|
||||
11,14.0,0
|
||||
12,17.0,0
|
||||
13,12.0,0
|
||||
14,14.0,0
|
||||
15,16.0,0
|
||||
16,27.0,0
|
||||
17,36.0,0
|
||||
18,17.0,0
|
||||
19,17.0,0
|
||||
20,21.0,0
|
||||
21,23.0,0
|
||||
22,13.0,0
|
||||
23,12.0,0
|
||||
24,17.0,0
|
||||
25,26.0,0
|
||||
26,25.0,0
|
||||
27,17.0,0
|
||||
28,10.0,0
|
||||
29,16.0,0
|
||||
30,14.0,0
|
||||
31,19.0,0
|
||||
32,23.0,0
|
||||
33,37.0,0
|
||||
34,29.0,0
|
||||
35,22.0,0
|
||||
36,29.0,0
|
||||
37,15.0,0
|
||||
38,16.0,0
|
||||
39,18.0,0
|
||||
40,23.0,0
|
||||
41,16.0,0
|
||||
42,26.0,0
|
||||
43,13.0,0
|
||||
44,24.0,0
|
||||
45,39.0,0
|
||||
46,23.0,0
|
||||
47,32.0,0
|
||||
48,123.0,0
|
||||
49,18.0,0
|
||||
50,39.0,0
|
||||
51,17.0,0
|
||||
52,28.0,0
|
||||
53,34.0,0
|
||||
54,26.0,0
|
||||
55,61.0,0
|
||||
56,28.0,0
|
||||
57,16.0,0
|
||||
58,45.0,0
|
||||
59,41.0,0
|
||||
60,49.0,0
|
||||
61,18.0,0
|
||||
62,40.0,0
|
||||
63,24.0,0
|
||||
64,37.0,0
|
||||
65,26.0,0
|
||||
66,51.0,0
|
||||
67,17.0,0
|
||||
68,152.0,0
|
||||
69,17.0,0
|
||||
70,29.0,0
|
||||
71,37.0,0
|
||||
72,15.0,0
|
||||
73,55.0,0
|
||||
74,152.0,0
|
||||
75,23.0,0
|
||||
76,45.0,0
|
||||
77,30.0,0
|
||||
78,39.0,0
|
||||
79,20.0,0
|
||||
80,53.0,0
|
||||
81,49.0,0
|
||||
82,71.0,0
|
||||
83,115.0,0
|
||||
84,41.0,0
|
||||
85,52.0,0
|
||||
86,52.0,0
|
||||
87,36.0,0
|
||||
88,84.0,0
|
||||
89,122.0,0
|
||||
90,49.0,0
|
||||
91,200.0,0
|
||||
92,67.0,0
|
||||
93,87.0,0
|
||||
94,183.0,0
|
||||
95,132.0,0
|
||||
96,76.0,0
|
||||
97,200.0,0
|
||||
98,200.0,0
|
||||
99,200.0,0
|
||||
100,200.0,0
|
||||
101,200.0,0
|
||||
102,106.0,0
|
||||
103,192.0,0
|
||||
104,111.0,0
|
||||
105,95.0,0
|
||||
106,200.0,0
|
||||
107,200.0,0
|
||||
108,148.0,0
|
||||
109,200.0,0
|
||||
110,97.0,0
|
||||
111,200.0,0
|
||||
112,200.0,0
|
||||
113,105.0,0
|
||||
114,135.0,0
|
||||
115,200.0,0
|
||||
116,144.0,0
|
||||
117,156.0,0
|
||||
118,200.0,0
|
||||
119,200.0,0
|
||||
120,166.0,0
|
||||
121,200.0,0
|
||||
122,200.0,0
|
||||
123,200.0,0
|
||||
124,200.0,0
|
||||
125,200.0,0
|
||||
126,200.0,0
|
||||
127,158.0,0
|
||||
128,139.0,0
|
||||
129,200.0,0
|
||||
130,200.0,0
|
||||
131,200.0,0
|
||||
132,200.0,0
|
||||
133,122.0,0
|
||||
134,200.0,0
|
||||
135,188.0,0
|
||||
136,200.0,0
|
||||
137,183.0,0
|
||||
138,200.0,0
|
||||
139,200.0,0
|
||||
140,200.0,0
|
||||
141,200.0,0
|
||||
142,200.0,0
|
||||
143,158.0,0
|
||||
144,200.0,0
|
||||
145,200.0,0
|
||||
146,200.0,0
|
||||
147,191.0,0
|
||||
148,200.0,0
|
||||
149,194.0,0
|
||||
150,178.0,0
|
||||
151,200.0,0
|
||||
152,200.0,0
|
||||
153,200.0,0
|
||||
154,162.0,0
|
||||
155,200.0,0
|
||||
156,200.0,0
|
||||
157,128.0,0
|
||||
158,200.0,0
|
||||
159,184.0,0
|
||||
160,194.0,0
|
||||
161,200.0,0
|
||||
162,200.0,0
|
||||
163,200.0,0
|
||||
164,200.0,0
|
||||
165,160.0,0
|
||||
166,163.0,0
|
||||
167,200.0,0
|
||||
168,200.0,0
|
||||
169,200.0,0
|
||||
170,141.0,0
|
||||
171,200.0,0
|
||||
172,200.0,0
|
||||
173,200.0,0
|
||||
174,200.0,0
|
||||
175,200.0,0
|
||||
176,200.0,0
|
||||
177,157.0,0
|
||||
178,164.0,0
|
||||
179,200.0,0
|
||||
180,200.0,0
|
||||
181,200.0,0
|
||||
182,200.0,0
|
||||
183,200.0,0
|
||||
184,200.0,0
|
||||
185,193.0,0
|
||||
186,182.0,0
|
||||
187,200.0,0
|
||||
188,200.0,0
|
||||
189,200.0,0
|
||||
190,200.0,0
|
||||
191,200.0,0
|
||||
192,174.0,0
|
||||
193,178.0,0
|
||||
194,200.0,0
|
||||
195,200.0,0
|
||||
196,200.0,0
|
||||
197,200.0,0
|
||||
198,200.0,0
|
||||
199,200.0,0
|
||||
|
@@ -1,125 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-11-07 18:10:37
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-07-21 21:52:31
|
||||
Discription:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import argparse
|
||||
|
||||
from common.utils import save_results,make_dir
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from DoubleDQN.double_dqn import DoubleDQN
|
||||
|
||||
def get_args():
|
||||
""" 超参数
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results/' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models/' ) # path to save models
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name)
|
||||
env.seed(seed)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n
|
||||
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
|
||||
memory = ReplayBuffer(cfg.memory_capacity)
|
||||
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print("开始训练!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.sample(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
agent.memory.push(state, action, reward, next_state, done)
|
||||
state = next_state
|
||||
agent.update()
|
||||
if done:
|
||||
break
|
||||
if i_ep % cfg.target_update == 0:
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
if (i_ep+1)%10 == 0:
|
||||
            print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
        rewards.append(ep_reward)
    print("Finish training!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print("开始测试!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
action = agent.predict(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
        print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
    print("Finish testing!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
# training
|
||||
env, agent = env_agent_config(cfg,seed=1)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
    save_args(cfg) # save parameters
    agent.save(path=cfg.model_path) # save the model
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
||||
    # testing
    env, agent = env_agent_config(cfg,seed=1)
    agent.load(path=cfg.model_path) # load the model
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
                 path=cfg.result_path) # save results
    plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results
|
||||
projects/codes/scripts/DoubleDQN_CartPole-v0.sh (new file, 15 lines)
@@ -0,0 +1,15 @@
|
||||
# run Double DQN on CartPole-v0
|
||||
# source conda; if you are already in the proper conda environment, comment out the lines up to and including "conda activate easyrl"
|
||||
|
||||
if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then
|
||||
echo "source file at ~/anaconda3/etc/profile.d/conda.sh"
|
||||
source ~/anaconda3/etc/profile.d/conda.sh
|
||||
elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then
|
||||
echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh"
|
||||
source ~/opt/anaconda3/etc/profile.d/conda.sh
|
||||
else
|
||||
echo 'please manually config the conda source path'
|
||||
fi
|
||||
conda activate easyrl # easyrl here can be changed to another name of conda env that you have created
|
||||
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
|
||||
python $codes_dir/DoubleDQN/main.py --device cuda
|
||||
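Assuming the repository layout above and a conda env named easyrl, the script can be launched from the repository root, for example:

bash projects/codes/scripts/DoubleDQN_CartPole-v0.sh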