hot update Double DQN
@@ -5,7 +5,7 @@
|
|||||||
@Email: johnjim0816@gmail.com
|
@Email: johnjim0816@gmail.com
|
||||||
@Date: 2020-06-12 00:50:49
|
@Date: 2020-06-12 00:50:49
|
||||||
@LastEditor: John
|
@LastEditor: John
|
||||||
LastEditTime: 2022-08-23 23:59:54
|
LastEditTime: 2022-08-29 23:30:08
|
||||||
@Discription:
|
@Discription:
|
||||||
@Environment: python 3.7.7
|
@Environment: python 3.7.7
|
||||||
'''
|
'''
|
||||||
@@ -78,7 +78,7 @@ class DQN:
|
|||||||
self.batch_size)
|
self.batch_size)
|
||||||
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
||||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
|
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
|
||||||
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
||||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||||
# print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
|
# print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
|
||||||
@@ -91,7 +91,7 @@ class DQN:
|
|||||||
# compute expected q value, for terminal state, done_batch[0]=1, and expected_q_value=reward correspondingly
|
# compute expected q value, for terminal state, done_batch[0]=1, and expected_q_value=reward correspondingly
|
||||||
expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch)
|
expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch)
|
||||||
# print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
|
# print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
|
||||||
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # the two inputs have the same shape
|
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # the two inputs have the same shape
|
||||||
# backpropagation
|
# backpropagation
|
||||||
self.optimizer.zero_grad()
|
self.optimizer.zero_grad()
|
||||||
loss.backward()
|
loss.backward()
|
||||||
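For reference, the TD target assembled in the hunk above is the standard DQN target; a minimal sketch with made-up numbers (the values are illustrative, not taken from this run) showing how the (1 - done) mask reduces terminal transitions to the reward alone:

```python
import torch

gamma = 0.95
reward_batch = torch.tensor([[1.0], [1.0]])             # shape (batch_size, 1)
next_max_q_value_batch = torch.tensor([[12.3], [9.7]])  # max_a Q_target(s', a), shape (batch_size, 1)
done_batch = torch.tensor([[0.0], [1.0]])               # second transition is terminal

# r + gamma * max_a Q'(s', a) * (1 - done): the terminal row collapses to the reward
expected_q_value_batch = reward_batch + gamma * next_max_q_value_batch * (1 - done_batch)
print(expected_q_value_batch)  # tensor([[12.6850], [ 1.0000]])
```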
|
|||||||
@@ -9,130 +9,122 @@ import torch
|
|||||||
import datetime
|
import datetime
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import argparse
|
import argparse
|
||||||
from common.utils import save_results,all_seed
|
from common.utils import all_seed
|
||||||
from common.utils import plot_rewards,save_args
|
|
||||||
from common.models import MLP
|
from common.models import MLP
|
||||||
from common.memories import ReplayBuffer
|
from common.memories import ReplayBuffer
|
||||||
|
from common.launcher import Launcher
|
||||||
|
from envs.register import register_env
|
||||||
from dqn import DQN
|
from dqn import DQN
|
||||||
|
class Main(Launcher):
|
||||||
|
def get_args(self):
|
||||||
|
""" hyperparameters
|
||||||
|
"""
|
||||||
|
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||||
|
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||||
|
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||||
|
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||||
|
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||||
|
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||||
|
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||||
|
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||||
|
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||||
|
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||||
|
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher the value, the slower the decay")
|
||||||
|
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||||
|
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||||
|
parser.add_argument('--batch_size',default=64,type=int)
|
||||||
|
parser.add_argument('--target_update',default=4,type=int)
|
||||||
|
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||||
|
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||||
|
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||||
|
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||||
|
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||||
|
# please manually change the following args in this script if you want
|
||||||
|
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||||
|
'/' + curr_time + '/results' )
|
||||||
|
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||||
|
'/' + curr_time + '/models' )
|
||||||
|
args = parser.parse_args()
|
||||||
|
args = {**vars(args)} # type(dict)
|
||||||
|
return args
|
||||||
|
|
||||||
def get_args():
|
def env_agent_config(cfg):
|
||||||
""" hyperparameters
|
''' create env and agent
|
||||||
"""
|
'''
|
||||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
register_env(cfg['env_name'])
|
||||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
env = gym.make(cfg['env_name'])
|
||||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
if cfg['seed'] !=0: # set random seed
|
||||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
all_seed(env,seed=cfg["seed"])
|
||||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
try: # state dimension
|
||||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
except AttributeError:
|
||||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
n_actions = env.action_space.n # action dimension
|
||||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher the value, the slower the decay")
|
cfg.update({"n_states":n_states,"n_actions":n_actions}) # add n_states and n_actions to the cfg parameters
|
||||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
||||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
||||||
parser.add_argument('--batch_size',default=64,type=int)
|
agent = DQN(model,memory,cfg) # create agent
|
||||||
parser.add_argument('--target_update',default=4,type=int)
|
return env, agent
|
||||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
|
||||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
|
||||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
|
||||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
|
||||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
|
||||||
# please manually change the following args in this script if you want
|
|
||||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
|
||||||
'/' + curr_time + '/results' )
|
|
||||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
|
||||||
'/' + curr_time + '/models' )
|
|
||||||
args = parser.parse_args()
|
|
||||||
args = {**vars(args)} # type(dict)
|
|
||||||
return args
|
|
||||||
|
|
||||||
def env_agent_config(cfg):
|
def train(cfg, env, agent):
|
||||||
''' create env and agent
|
''' train
|
||||||
'''
|
'''
|
||||||
env = gym.make(cfg['env_name']) # create env
|
print("Start training!")
|
||||||
if cfg['seed'] !=0: # set random seed
|
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||||
all_seed(env,seed=cfg["seed"])
|
rewards = [] # record rewards for all episodes
|
||||||
n_states = env.observation_space.shape[0] # state dimension
|
steps = []
|
||||||
n_actions = env.action_space.n # action dimension
|
for i_ep in range(cfg["train_eps"]):
|
||||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
ep_reward = 0 # reward per episode
|
||||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # add n_states and n_actions to the cfg parameters
|
ep_step = 0
|
||||||
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
state = env.reset() # reset and obtain initial state
|
||||||
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
for _ in range(cfg['ep_max_steps']):
|
||||||
agent = DQN(model,memory,cfg) # create agent
|
ep_step += 1
|
||||||
return env, agent
|
action = agent.sample_action(state) # sample action
|
||||||
|
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||||
|
agent.memory.push(state, action, reward,
|
||||||
|
next_state, done) # save transitions
|
||||||
|
state = next_state # update next state for env
|
||||||
|
agent.update() # update agent
|
||||||
|
ep_reward += reward #
|
||||||
|
if done:
|
||||||
|
break
|
||||||
|
if (i_ep + 1) % cfg["target_update"] == 0: # target net update; target_update is the "C" in the pseudocode
|
||||||
|
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||||
|
steps.append(ep_step)
|
||||||
|
rewards.append(ep_reward)
|
||||||
|
if (i_ep + 1) % 10 == 0:
|
||||||
|
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
||||||
|
print("Finish training!")
|
||||||
|
env.close()
|
||||||
|
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||||
|
return res_dic
|
||||||
|
|
||||||
def train(cfg, env, agent):
|
def test(cfg, env, agent):
|
||||||
''' train
|
print("Start testing!")
|
||||||
'''
|
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||||
print("Start training!")
|
rewards = [] # record rewards for all episodes
|
||||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
steps = []
|
||||||
rewards = [] # record rewards for all episodes
|
for i_ep in range(cfg['test_eps']):
|
||||||
steps = []
|
ep_reward = 0 # reward per episode
|
||||||
for i_ep in range(cfg["train_eps"]):
|
ep_step = 0
|
||||||
ep_reward = 0 # reward per episode
|
state = env.reset() # reset and obtain initial state
|
||||||
ep_step = 0
|
for _ in range(cfg['ep_max_steps']):
|
||||||
state = env.reset() # reset and obtain initial state
|
ep_step+=1
|
||||||
for _ in range(cfg['ep_max_steps']):
|
action = agent.predict_action(state) # predict action
|
||||||
ep_step += 1
|
next_state, reward, done, _ = env.step(action)
|
||||||
action = agent.sample_action(state) # sample action
|
state = next_state
|
||||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
ep_reward += reward
|
||||||
agent.memory.push(state, action, reward,
|
if done:
|
||||||
next_state, done) # save transitions
|
break
|
||||||
state = next_state # update next state for env
|
steps.append(ep_step)
|
||||||
agent.update() # update agent
|
rewards.append(ep_reward)
|
||||||
ep_reward += reward #
|
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||||
if done:
|
print("Finish testing!")
|
||||||
break
|
env.close()
|
||||||
if (i_ep + 1) % cfg["target_update"] == 0: # target net update; target_update is the "C" in the pseudocode
|
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
|
||||||
steps.append(ep_step)
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
if (i_ep + 1) % 10 == 0:
|
|
||||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
|
||||||
print("Finish training!")
|
|
||||||
env.close()
|
|
||||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
|
||||||
return res_dic
|
|
||||||
|
|
||||||
def test(cfg, env, agent):
|
|
||||||
print("Start testing!")
|
|
||||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
|
||||||
rewards = [] # record rewards for all episodes
|
|
||||||
steps = []
|
|
||||||
for i_ep in range(cfg['test_eps']):
|
|
||||||
ep_reward = 0 # reward per episode
|
|
||||||
ep_step = 0
|
|
||||||
state = env.reset() # reset and obtain initial state
|
|
||||||
for _ in range(cfg['ep_max_steps']):
|
|
||||||
ep_step+=1
|
|
||||||
action = agent.predict_action(state) # predict action
|
|
||||||
next_state, reward, done, _ = env.step(action)
|
|
||||||
state = next_state
|
|
||||||
ep_reward += reward
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
steps.append(ep_step)
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
|
||||||
print("Finish testing!")
|
|
||||||
env.close()
|
|
||||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cfg = get_args()
|
main = Main()
|
||||||
# training
|
main.run()
|
||||||
env, agent = env_agent_config(cfg)
|
|
||||||
res_dic = train(cfg, env, agent)
|
|
||||||
save_args(cfg,path = cfg['result_path']) # save parameters
|
|
||||||
agent.save_model(path = cfg['model_path']) # save models
|
|
||||||
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
|
|
||||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
|
|
||||||
# testing
|
|
||||||
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
|
|
||||||
agent.load_model(path = cfg['model_path']) # load model
|
|
||||||
res_dic = test(cfg, env, agent)
|
|
||||||
save_results(res_dic, tag='test',
|
|
||||||
path = cfg['result_path'])
|
|
||||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
|
|
||||||
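The try/except around env.observation_space.n in env_agent_config above distinguishes discrete observation spaces (which expose .n) from box spaces (which expose .shape). A small illustration, assuming the classic gym API used in this diff and a gym version where FrozenLake-v1 is registered:

```python
import gym

box_env = gym.make('CartPole-v0')               # Box observation space
print(box_env.observation_space.shape)          # (4,)  -> n_states = 4
print(hasattr(box_env.observation_space, 'n'))  # False -> falls into the except branch

disc_env = gym.make('FrozenLake-v1')            # Discrete observation space
print(disc_env.observation_space.n)             # 16    -> n_states = 16
```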
|
|||||||
@@ -1 +1,21 @@
|
|||||||
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true}
|
{
|
||||||
|
"algo_name": "DQN",
|
||||||
|
"env_name": "CartPole-v0",
|
||||||
|
"train_eps": 200,
|
||||||
|
"test_eps": 20,
|
||||||
|
"gamma": 0.95,
|
||||||
|
"epsilon_start": 0.95,
|
||||||
|
"epsilon_end": 0.01,
|
||||||
|
"epsilon_decay": 500,
|
||||||
|
"lr": 0.0001,
|
||||||
|
"memory_capacity": 100000,
|
||||||
|
"batch_size": 64,
|
||||||
|
"target_update": 4,
|
||||||
|
"hidden_dim": 256,
|
||||||
|
"device": "cpu",
|
||||||
|
"seed": 10,
|
||||||
|
"result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results",
|
||||||
|
"model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models",
|
||||||
|
"show_fig": false,
|
||||||
|
"save_fig": true
|
||||||
|
}
|
||||||
@@ -1 +1,24 @@
|
|||||||
{"algo_name": "DQN", "env_name": "CartPole-v1", "train_eps": 2000, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 6000, "lr": 1e-05, "memory_capacity": 200000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", "n_states": 4, "n_actions": 2}
|
{
|
||||||
|
"algo_name": "DQN",
|
||||||
|
"env_name": "CartPole-v1",
|
||||||
|
"train_eps": 2000,
|
||||||
|
"test_eps": 20,
|
||||||
|
"ep_max_steps": 100000,
|
||||||
|
"gamma": 0.99,
|
||||||
|
"epsilon_start": 0.95,
|
||||||
|
"epsilon_end": 0.01,
|
||||||
|
"epsilon_decay": 6000,
|
||||||
|
"lr": 1e-05,
|
||||||
|
"memory_capacity": 200000,
|
||||||
|
"batch_size": 64,
|
||||||
|
"target_update": 4,
|
||||||
|
"hidden_dim": 256,
|
||||||
|
"device": "cuda",
|
||||||
|
"seed": 10,
|
||||||
|
"show_fig": false,
|
||||||
|
"save_fig": true,
|
||||||
|
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results",
|
||||||
|
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models",
|
||||||
|
"n_states": 4,
|
||||||
|
"n_actions": 2
|
||||||
|
}
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
@Email: johnjim0816@gmail.com
|
@Email: johnjim0816@gmail.com
|
||||||
@Date: 2020-06-12 00:50:49
|
@Date: 2020-06-12 00:50:49
|
||||||
@LastEditor: John
|
@LastEditor: John
|
||||||
LastEditTime: 2022-07-21 00:08:26
|
LastEditTime: 2022-08-29 23:34:20
|
||||||
@Discription:
|
@Discription:
|
||||||
@Environment: python 3.7.7
|
@Environment: python 3.7.7
|
||||||
'''
|
'''
|
||||||
@@ -20,148 +20,87 @@ import torch.nn.functional as F
|
|||||||
import random
|
import random
|
||||||
import math
|
import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
class ReplayBuffer:
|
|
||||||
def __init__(self, capacity):
|
|
||||||
self.capacity = capacity # capacity of the replay buffer
|
|
||||||
self.buffer = [] # buffer
|
|
||||||
self.position = 0
|
|
||||||
|
|
||||||
def push(self, state, action, reward, next_state, done):
|
|
||||||
''' The buffer works as a queue: once capacity is exceeded, the earliest stored transition is dropped
|
|
||||||
'''
|
|
||||||
if len(self.buffer) < self.capacity:
|
|
||||||
self.buffer.append(None)
|
|
||||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
|
||||||
self.position = (self.position + 1) % self.capacity
|
|
||||||
|
|
||||||
def sample(self, batch_size):
|
|
||||||
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
|
|
||||||
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
|
|
||||||
return state, action, reward, next_state, done
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
''' return the current number of stored transitions
|
|
||||||
'''
|
|
||||||
return len(self.buffer)
|
|
||||||
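The list-based ReplayBuffer deleted above is replaced in the new DoubleDQN entry point by ReplayBufferQue from common.memories, whose source is not part of this diff. The following is only a minimal deque-based sketch consistent with how it is called in main.py (push receives a whole transition tuple, sample returns unzipped fields), not the actual implementation:

```python
import random
from collections import deque

class ReplayBufferQue:
    '''Replay buffer backed by a bounded deque; the oldest transitions are dropped automatically.'''
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        # transition is a tuple: (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)  # states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
```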
|
|
||||||
class MLP(nn.Module):
|
|
||||||
def __init__(self, n_states,n_actions,hidden_dim=128):
|
|
||||||
""" 初始化q网络,为全连接网络
|
|
||||||
n_states: 输入的特征数即环境的状态维度
|
|
||||||
n_actions: 输出的动作维度
|
|
||||||
"""
|
|
||||||
super(MLP, self).__init__()
|
|
||||||
self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
|
|
||||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
|
|
||||||
self.fc3 = nn.Linear(hidden_dim, n_actions) # output layer
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
# activation functions for each layer
|
|
||||||
x = F.relu(self.fc1(x))
|
|
||||||
x = F.relu(self.fc2(x))
|
|
||||||
return self.fc3(x)
|
|
||||||
|
|
||||||
class DoubleDQN:
|
class DoubleDQN:
|
||||||
def __init__(self, n_states, n_actions, model, memory, cfg):
|
def __init__(self,models, memories, cfg):
|
||||||
self.n_actions = n_actions # total number of actions
|
self.n_actions = cfg['n_actions']
|
||||||
self.device = torch.device(cfg.device) # device: cpu or gpu
|
self.device = torch.device(cfg['device'])
|
||||||
self.gamma = cfg.gamma
|
self.gamma = cfg['gamma']
|
||||||
# epsilon-greedy policy parameters
|
## e-greedy parameters
|
||||||
self.sample_count = 0
|
self.sample_count = 0 # sample count for epsilon decay
|
||||||
self.epsilon_start = cfg.epsilon_start
|
self.epsilon_start = cfg['epsilon_start']
|
||||||
self.epsilon_end = cfg.epsilon_end
|
self.epsilon_end = cfg['epsilon_end']
|
||||||
self.epsilon_decay = cfg.epsilon_decay
|
self.epsilon_decay = cfg['epsilon_decay']
|
||||||
self.batch_size = cfg.batch_size
|
self.batch_size = cfg['batch_size']
|
||||||
self.policy_net = model.to(self.device)
|
self.policy_net = models['Qnet'].to(self.device)
|
||||||
self.target_net = model.to(self.device)
|
self.target_net = models['Qnet'].to(self.device)
|
||||||
# target_net copy from policy_net
|
# target_net copy from policy_net
|
||||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||||
target_param.data.copy_(param.data)
|
target_param.data.copy_(param.data)
|
||||||
# self.target_net.eval() # disable BatchNormalization and Dropout
|
# self.target_net.eval() # do not use BatchNormalization or Dropout
|
||||||
# see the difference between parameters() and state_dict(); the former has requires_grad=True
|
# the difference between parameters() and state_dict() is that parameters() has requires_grad=True
|
||||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
|
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr'])
|
||||||
self.loss = 0
|
self.memory = memories['Memory']
|
||||||
self.memory = memory
|
self.update_flag = False
|
||||||
|
|
||||||
def sample(self, state):
|
def sample_action(self, state):
|
||||||
''' select an action
|
''' sample action
|
||||||
'''
|
'''
|
||||||
self.sample_count += 1
|
self.sample_count += 1
|
||||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
|
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
|
||||||
if random.random() > self.epsilon:
|
if random.random() > self.epsilon:
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# convert to a tensor first so it can be fed to the network; the state elements are originally float64
|
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
|
||||||
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
|
|
||||||
state = torch.tensor(
|
|
||||||
[state], device=self.device, dtype=torch.float32)
|
|
||||||
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
|
|
||||||
q_value = self.policy_net(state)
|
q_value = self.policy_net(state)
|
||||||
# tensor.max(1) returns the max value of each row together with its index,
|
|
||||||
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
|
|
||||||
# so tensor.max(1)[1] returns the index of the max value, i.e. the action
|
|
||||||
action = q_value.max(1)[1].item()
|
action = q_value.max(1)[1].item()
|
||||||
else:
|
else:
|
||||||
action = random.randrange(self.n_actions)
|
action = random.randrange(self.n_actions)
|
||||||
return action
|
return action
|
||||||
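The exponential epsilon schedule in the sampling method above decays from epsilon_start toward epsilon_end as sample_count grows; a quick check with the default hyperparameters used in this commit (epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=500):

```python
import math

eps_start, eps_end, eps_decay = 0.95, 0.01, 500
for sample_count in (0, 500, 1000, 2000, 5000):
    eps = eps_end + (eps_start - eps_end) * math.exp(-sample_count / eps_decay)
    print(sample_count, round(eps, 3))
# 0 -> 0.95, 500 -> 0.356, 1000 -> 0.137, 2000 -> 0.027, 5000 -> 0.01
```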
def predict(self, state):
|
def predict_action(self, state):
|
||||||
''' select an action
|
''' predict action
|
||||||
'''
|
'''
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
|
||||||
q_value = self.policy_net(state)
|
q_value = self.policy_net(state)
|
||||||
action = q_value.max(1)[1].item()
|
action = q_value.max(1)[1].item()
|
||||||
return action
|
return action
|
||||||
def update(self):
|
def update(self):
|
||||||
if len(self.memory) < self.batch_size: # only update once there are enough transitions in memory
|
if len(self.memory) < self.batch_size: # do not update until memory holds at least one batch of transitions
|
||||||
return
|
return
|
||||||
# randomly sample transitions from memory
|
else:
|
||||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
if not self.update_flag:
|
||||||
self.batch_size)
|
print("Begin to update!")
|
||||||
|
self.update_flag = True
|
||||||
|
# sample a batch of transitions from replay buffer
|
||||||
|
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
|
||||||
# convert to tensor
|
# convert to tensor
|
||||||
state_batch = torch.tensor(
|
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
|
||||||
state_batch, device=self.device, dtype=torch.float)
|
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
|
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
|
||||||
1) # e.g. tensor([[1],...,[0]])
|
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
|
||||||
reward_batch = torch.tensor(
|
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||||
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
|
# compute current Q(s_t|a=a_t)
|
||||||
next_state_batch = torch.tensor(
|
q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
|
||||||
next_state_batch, device=self.device, dtype=torch.float)
|
next_q_value_batch = self.policy_net(next_state_batch)
|
||||||
|
'''the following is how the Double DQN expected_q_value is computed, a bit different from Nature DQN'''
|
||||||
done_batch = torch.tensor(np.float32(
|
next_target_value_batch = self.target_net(next_state_batch)
|
||||||
done_batch), device=self.device) # convert bool to float and then to a tensor
|
# select action a with Q(s_t', a), then gather from next_target_value_batch to obtain next_q_value, i.e. Q'(s_t', argmax_a Q(s_t', a))
|
||||||
# compute Q(s_t, a) for the current (s_t, a)
|
next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1)
|
||||||
q_values = self.policy_net(state_batch)
|
expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch)
|
||||||
next_q_values = self.policy_net(next_state_batch)
|
loss = nn.MSELoss()(q_value_batch , expected_q_value_batch)
|
||||||
# plug in the chosen action to obtain Q(s_t|a=a_t)
|
self.optimizer.zero_grad()
|
||||||
q_value = q_values.gather(dim=1, index=action_batch)
|
loss.backward()
|
||||||
'''The following is the Nature DQN way of computing q_target:
|
# clip to avoid gradient explosion
|
||||||
# compute max Q'(s_{t+1}) over all next states, where Q' is the target network's Q-function
|
for param in self.policy_net.parameters():
|
||||||
next_q_state_value = self.target_net(
|
|
||||||
next_state_batch).max(1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
|
|
||||||
# compute q_target
|
|
||||||
# for a terminal state, done_batch[0]=1 and the corresponding expected_q_value equals the reward
|
|
||||||
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
|
|
||||||
'''
|
|
||||||
'''The following is the Double DQN way of computing q_target, slightly different from Nature DQN'''
|
|
||||||
next_target_values = self.target_net(
|
|
||||||
next_state_batch)
|
|
||||||
# select the action from Q(s_t', a) and plug it into next_target_values to get the target net's next_q_value, i.e. Q'(s_t', argmax_a Q(s_t', a))
|
|
||||||
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
|
|
||||||
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
|
|
||||||
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # compute the MSE loss
|
|
||||||
# optimize the model
|
|
||||||
self.optimizer.zero_grad() # zero_grad clears all the old gradients from the last step
|
|
||||||
# loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters that require gradients
|
|
||||||
self.loss.backward()
|
|
||||||
for param in self.policy_net.parameters(): # clip to avoid gradient explosion
|
|
||||||
param.grad.data.clamp_(-1, 1)
|
param.grad.data.clamp_(-1, 1)
|
||||||
self.optimizer.step() # update the model
|
self.optimizer.step()
|
||||||
|
|
||||||
def save(self,path):
|
def save_model(self,path):
|
||||||
|
from pathlib import Path
|
||||||
|
# create path
|
||||||
|
Path(path).mkdir(parents=True, exist_ok=True)
|
||||||
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
|
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
|
||||||
|
|
||||||
def load(self,path):
|
def load_model(self,path):
|
||||||
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
|
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
|
||||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||||
param.data.copy_(target_param.data)
|
param.data.copy_(target_param.data)
|
||||||
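The update() hunk above selects the next action with the policy network and evaluates it with the target network, which is the defining difference between Double DQN and Nature DQN. A compact side-by-side sketch of the two targets (shapes follow the comments in the code above; in practice the targets are computed without tracking gradients, e.g. inside torch.no_grad()):

```python
import torch

# q_policy_next = Q(s', .) from the policy net, q_target_next = Q'(s', .) from the target net,
# both of shape (batch_size, n_actions); reward and done have shape (batch_size, 1)
def nature_dqn_target(reward, done, q_target_next, gamma=0.95):
    # select and evaluate the next action with the same (target) network
    next_q = q_target_next.max(1)[0].unsqueeze(1)
    return reward + gamma * next_q * (1 - done)

def double_dqn_target(reward, done, q_policy_next, q_target_next, gamma=0.95):
    # select the greedy action with the policy net, evaluate it with the target net
    next_action = q_policy_next.max(1)[1].unsqueeze(1)
    next_q = q_target_next.gather(1, next_action)
    return reward + gamma * next_q * (1 - done)
```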
|
|||||||
projects/codes/DoubleDQN/main.py (new file, 129 lines)
@@ -0,0 +1,129 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
Author: JiangJi
|
||||||
|
Email: johnjim0816@gmail.com
|
||||||
|
Date: 2021-11-07 18:10:37
|
||||||
|
LastEditor: JiangJi
|
||||||
|
LastEditTime: 2022-08-29 23:33:31
|
||||||
|
Discription:
|
||||||
|
'''
|
||||||
|
import sys,os
|
||||||
|
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||||
|
parent_path = os.path.dirname(curr_path) # parent path
|
||||||
|
sys.path.append(parent_path) # add to system path
|
||||||
|
|
||||||
|
import gym
|
||||||
|
import datetime
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from common.utils import all_seed
|
||||||
|
from common.models import MLP
|
||||||
|
from common.memories import ReplayBufferQue
|
||||||
|
from DoubleDQN.double_dqn import DoubleDQN
|
||||||
|
from common.launcher import Launcher
|
||||||
|
from envs.register import register_env
|
||||||
|
class Main(Launcher):
|
||||||
|
def get_args(self):
|
||||||
|
''' hyperparameters
|
||||||
|
'''
|
||||||
|
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||||
|
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||||
|
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||||
|
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||||
|
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||||
|
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||||
|
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||||
|
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||||
|
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||||
|
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||||
|
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||||
|
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||||
|
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||||
|
parser.add_argument('--batch_size',default=64,type=int)
|
||||||
|
parser.add_argument('--target_update',default=4,type=int)
|
||||||
|
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||||
|
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||||
|
parser.add_argument('--seed',default=1,type=int,help="seed")
|
||||||
|
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||||
|
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||||
|
args = parser.parse_args()
|
||||||
|
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||||
|
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||||
|
}
|
||||||
|
args = {**vars(args),**default_args} # type(dict)
|
||||||
|
return args
|
||||||
|
def env_agent_config(self,cfg):
|
||||||
|
''' create env and agent
|
||||||
|
'''
|
||||||
|
register_env(cfg['env_name'])
|
||||||
|
env = gym.make(cfg['env_name'])
|
||||||
|
if cfg['seed'] !=0: # set random seed
|
||||||
|
all_seed(env,seed=cfg["seed"])
|
||||||
|
try: # state dimension
|
||||||
|
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||||
|
except AttributeError:
|
||||||
|
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||||
|
n_actions = env.action_space.n # action dimension
|
||||||
|
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||||
|
cfg.update({"n_states":n_states,"n_actions":n_actions}) # add n_states and n_actions to the cfg parameters
|
||||||
|
models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])}
|
||||||
|
memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])}
|
||||||
|
agent = DoubleDQN(models,memories,cfg)
|
||||||
|
return env,agent
|
||||||
|
|
||||||
|
def train(self,cfg,env,agent):
|
||||||
|
print("Start training!")
|
||||||
|
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||||
|
rewards = [] # record rewards for all episodes
|
||||||
|
steps = []
|
||||||
|
for i_ep in range(cfg["train_eps"]):
|
||||||
|
ep_reward = 0 # reward per episode
|
||||||
|
ep_step = 0
|
||||||
|
state = env.reset() # reset and obtain initial state
|
||||||
|
for _ in range(cfg['ep_max_steps']):
|
||||||
|
action = agent.sample_action(state)
|
||||||
|
next_state, reward, done, _ = env.step(action)
|
||||||
|
ep_reward += reward
|
||||||
|
agent.memory.push((state, action, reward, next_state, done))
|
||||||
|
state = next_state
|
||||||
|
agent.update()
|
||||||
|
if done:
|
||||||
|
break
|
||||||
|
if i_ep % cfg['target_update'] == 0:
|
||||||
|
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||||
|
steps.append(ep_step)
|
||||||
|
rewards.append(ep_reward)
|
||||||
|
if (i_ep+1)%10 == 0:
|
||||||
|
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
||||||
|
print("Finish training!")
|
||||||
|
env.close()
|
||||||
|
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||||
|
return res_dic
|
||||||
|
|
||||||
|
def test(self,cfg,env,agent):
|
||||||
|
print("Start testing!")
|
||||||
|
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||||
|
rewards = [] # record rewards for all episodes
|
||||||
|
steps = []
|
||||||
|
for i_ep in range(cfg['test_eps']):
|
||||||
|
ep_reward = 0 # reward per episode
|
||||||
|
ep_step = 0
|
||||||
|
state = env.reset() # reset and obtain initial state
|
||||||
|
for _ in range(cfg['ep_max_steps']):
|
||||||
|
action = agent.predict_action(state)
|
||||||
|
next_state, reward, done, _ = env.step(action)
|
||||||
|
state = next_state
|
||||||
|
ep_reward += reward
|
||||||
|
if done:
|
||||||
|
break
|
||||||
|
steps.append(ep_step)
|
||||||
|
rewards.append(ep_reward)
|
||||||
|
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||||
|
print("Finish testing!")
|
||||||
|
env.close()
|
||||||
|
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main = Main()
|
||||||
|
main.run()
|
||||||
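Both new entry points subclass Launcher from common.launcher and only call main.run(); the base class itself is not included in this diff. A hypothetical minimal sketch of what run() presumably wires together, based on the methods the subclasses define (get_args, env_agent_config, train, test); the saving, loading and plotting steps that produce the outputs/ files in this commit are omitted:

```python
class Launcher:
    '''Minimal runner sketch (assumed interface, not the real common.launcher code).'''
    def run(self):
        cfg = self.get_args()                    # hyperparameters as a dict
        env, agent = self.env_agent_config(cfg)  # build environment and agent
        res_train = self.train(cfg, env, agent)  # returns {'episodes', 'rewards', 'steps'}
        res_test = self.test(cfg, env, agent)    # model saving/loading omitted in this sketch
        return res_train, res_test
```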
Binary file not shown.
@@ -1 +0,0 @@
|
|||||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}
|
|
||||||
Binary file not shown.
Binary file not shown.
(previous image, 34 KiB, not shown)
Binary file not shown.
Binary file not shown.
(previous image, 43 KiB, not shown)
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2}
|
||||||
Binary file not shown.
(new image, 53 KiB, not shown)
@@ -0,0 +1,21 @@
|
|||||||
|
episodes,rewards,steps
|
||||||
|
0,145.0,0
|
||||||
|
1,166.0,0
|
||||||
|
2,171.0,0
|
||||||
|
3,200.0,0
|
||||||
|
4,139.0,0
|
||||||
|
5,200.0,0
|
||||||
|
6,200.0,0
|
||||||
|
7,141.0,0
|
||||||
|
8,200.0,0
|
||||||
|
9,187.0,0
|
||||||
|
10,166.0,0
|
||||||
|
11,172.0,0
|
||||||
|
12,121.0,0
|
||||||
|
13,200.0,0
|
||||||
|
14,200.0,0
|
||||||
|
15,149.0,0
|
||||||
|
16,128.0,0
|
||||||
|
17,200.0,0
|
||||||
|
18,178.0,0
|
||||||
|
19,185.0,0
|
||||||
|
Binary file not shown.
(new image, 65 KiB, not shown)
@@ -0,0 +1,201 @@
|
|||||||
|
episodes,rewards,steps
|
||||||
|
0,19.0,0
|
||||||
|
1,16.0,0
|
||||||
|
2,17.0,0
|
||||||
|
3,11.0,0
|
||||||
|
4,10.0,0
|
||||||
|
5,27.0,0
|
||||||
|
6,16.0,0
|
||||||
|
7,9.0,0
|
||||||
|
8,20.0,0
|
||||||
|
9,21.0,0
|
||||||
|
10,15.0,0
|
||||||
|
11,10.0,0
|
||||||
|
12,14.0,0
|
||||||
|
13,37.0,0
|
||||||
|
14,12.0,0
|
||||||
|
15,10.0,0
|
||||||
|
16,27.0,0
|
||||||
|
17,33.0,0
|
||||||
|
18,19.0,0
|
||||||
|
19,13.0,0
|
||||||
|
20,26.0,0
|
||||||
|
21,15.0,0
|
||||||
|
22,29.0,0
|
||||||
|
23,11.0,0
|
||||||
|
24,20.0,0
|
||||||
|
25,23.0,0
|
||||||
|
26,23.0,0
|
||||||
|
27,26.0,0
|
||||||
|
28,17.0,0
|
||||||
|
29,33.0,0
|
||||||
|
30,16.0,0
|
||||||
|
31,48.0,0
|
||||||
|
32,48.0,0
|
||||||
|
33,69.0,0
|
||||||
|
34,58.0,0
|
||||||
|
35,24.0,0
|
||||||
|
36,18.0,0
|
||||||
|
37,28.0,0
|
||||||
|
38,12.0,0
|
||||||
|
39,12.0,0
|
||||||
|
40,18.0,0
|
||||||
|
41,12.0,0
|
||||||
|
42,13.0,0
|
||||||
|
43,21.0,0
|
||||||
|
44,30.0,0
|
||||||
|
45,32.0,0
|
||||||
|
46,22.0,0
|
||||||
|
47,18.0,0
|
||||||
|
48,12.0,0
|
||||||
|
49,12.0,0
|
||||||
|
50,20.0,0
|
||||||
|
51,32.0,0
|
||||||
|
52,15.0,0
|
||||||
|
53,100.0,0
|
||||||
|
54,26.0,0
|
||||||
|
55,25.0,0
|
||||||
|
56,18.0,0
|
||||||
|
57,15.0,0
|
||||||
|
58,35.0,0
|
||||||
|
59,12.0,0
|
||||||
|
60,65.0,0
|
||||||
|
61,27.0,0
|
||||||
|
62,29.0,0
|
||||||
|
63,22.0,0
|
||||||
|
64,83.0,0
|
||||||
|
65,24.0,0
|
||||||
|
66,28.0,0
|
||||||
|
67,15.0,0
|
||||||
|
68,43.0,0
|
||||||
|
69,13.0,0
|
||||||
|
70,22.0,0
|
||||||
|
71,46.0,0
|
||||||
|
72,14.0,0
|
||||||
|
73,32.0,0
|
||||||
|
74,44.0,0
|
||||||
|
75,53.0,0
|
||||||
|
76,31.0,0
|
||||||
|
77,51.0,0
|
||||||
|
78,61.0,0
|
||||||
|
79,30.0,0
|
||||||
|
80,36.0,0
|
||||||
|
81,30.0,0
|
||||||
|
82,48.0,0
|
||||||
|
83,26.0,0
|
||||||
|
84,27.0,0
|
||||||
|
85,43.0,0
|
||||||
|
86,20.0,0
|
||||||
|
87,87.0,0
|
||||||
|
88,71.0,0
|
||||||
|
89,43.0,0
|
||||||
|
90,57.0,0
|
||||||
|
91,40.0,0
|
||||||
|
92,37.0,0
|
||||||
|
93,43.0,0
|
||||||
|
94,31.0,0
|
||||||
|
95,45.0,0
|
||||||
|
96,47.0,0
|
||||||
|
97,52.0,0
|
||||||
|
98,48.0,0
|
||||||
|
99,98.0,0
|
||||||
|
100,49.0,0
|
||||||
|
101,98.0,0
|
||||||
|
102,68.0,0
|
||||||
|
103,70.0,0
|
||||||
|
104,74.0,0
|
||||||
|
105,73.0,0
|
||||||
|
106,127.0,0
|
||||||
|
107,92.0,0
|
||||||
|
108,70.0,0
|
||||||
|
109,97.0,0
|
||||||
|
110,66.0,0
|
||||||
|
111,112.0,0
|
||||||
|
112,138.0,0
|
||||||
|
113,81.0,0
|
||||||
|
114,74.0,0
|
||||||
|
115,153.0,0
|
||||||
|
116,113.0,0
|
||||||
|
117,88.0,0
|
||||||
|
118,138.0,0
|
||||||
|
119,200.0,0
|
||||||
|
120,84.0,0
|
||||||
|
121,123.0,0
|
||||||
|
122,158.0,0
|
||||||
|
123,171.0,0
|
||||||
|
124,137.0,0
|
||||||
|
125,143.0,0
|
||||||
|
126,170.0,0
|
||||||
|
127,127.0,0
|
||||||
|
128,118.0,0
|
||||||
|
129,200.0,0
|
||||||
|
130,189.0,0
|
||||||
|
131,149.0,0
|
||||||
|
132,137.0,0
|
||||||
|
133,115.0,0
|
||||||
|
134,153.0,0
|
||||||
|
135,136.0,0
|
||||||
|
136,140.0,0
|
||||||
|
137,169.0,0
|
||||||
|
138,187.0,0
|
||||||
|
139,200.0,0
|
||||||
|
140,196.0,0
|
||||||
|
141,200.0,0
|
||||||
|
142,200.0,0
|
||||||
|
143,137.0,0
|
||||||
|
144,200.0,0
|
||||||
|
145,185.0,0
|
||||||
|
146,200.0,0
|
||||||
|
147,164.0,0
|
||||||
|
148,200.0,0
|
||||||
|
149,143.0,0
|
||||||
|
150,143.0,0
|
||||||
|
151,112.0,0
|
||||||
|
152,192.0,0
|
||||||
|
153,200.0,0
|
||||||
|
154,144.0,0
|
||||||
|
155,188.0,0
|
||||||
|
156,200.0,0
|
||||||
|
157,133.0,0
|
||||||
|
158,200.0,0
|
||||||
|
159,143.0,0
|
||||||
|
160,158.0,0
|
||||||
|
161,161.0,0
|
||||||
|
162,169.0,0
|
||||||
|
163,176.0,0
|
||||||
|
164,200.0,0
|
||||||
|
165,149.0,0
|
||||||
|
166,156.0,0
|
||||||
|
167,200.0,0
|
||||||
|
168,200.0,0
|
||||||
|
169,200.0,0
|
||||||
|
170,134.0,0
|
||||||
|
171,171.0,0
|
||||||
|
172,200.0,0
|
||||||
|
173,200.0,0
|
||||||
|
174,200.0,0
|
||||||
|
175,194.0,0
|
||||||
|
176,200.0,0
|
||||||
|
177,138.0,0
|
||||||
|
178,159.0,0
|
||||||
|
179,187.0,0
|
||||||
|
180,200.0,0
|
||||||
|
181,192.0,0
|
||||||
|
182,200.0,0
|
||||||
|
183,200.0,0
|
||||||
|
184,200.0,0
|
||||||
|
185,173.0,0
|
||||||
|
186,200.0,0
|
||||||
|
187,178.0,0
|
||||||
|
188,176.0,0
|
||||||
|
189,196.0,0
|
||||||
|
190,200.0,0
|
||||||
|
191,195.0,0
|
||||||
|
192,158.0,0
|
||||||
|
193,156.0,0
|
||||||
|
194,200.0,0
|
||||||
|
195,200.0,0
|
||||||
|
196,200.0,0
|
||||||
|
197,200.0,0
|
||||||
|
198,193.0,0
|
||||||
|
199,200.0,0
|
||||||
|
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2}
|
||||||
Binary file not shown.
(new image, 40 KiB, not shown)
@@ -0,0 +1,21 @@
|
|||||||
|
episodes,rewards,steps
|
||||||
|
0,200.0,0
|
||||||
|
1,200.0,0
|
||||||
|
2,200.0,0
|
||||||
|
3,200.0,0
|
||||||
|
4,191.0,0
|
||||||
|
5,200.0,0
|
||||||
|
6,200.0,0
|
||||||
|
7,179.0,0
|
||||||
|
8,200.0,0
|
||||||
|
9,200.0,0
|
||||||
|
10,200.0,0
|
||||||
|
11,190.0,0
|
||||||
|
12,147.0,0
|
||||||
|
13,197.0,0
|
||||||
|
14,200.0,0
|
||||||
|
15,200.0,0
|
||||||
|
16,167.0,0
|
||||||
|
17,200.0,0
|
||||||
|
18,200.0,0
|
||||||
|
19,200.0,0
|
||||||
|
Binary file not shown.
(new image, 65 KiB, not shown)
@@ -0,0 +1,201 @@
|
|||||||
|
episodes,rewards,steps
|
||||||
|
0,19.0,0
|
||||||
|
1,16.0,0
|
||||||
|
2,17.0,0
|
||||||
|
3,11.0,0
|
||||||
|
4,10.0,0
|
||||||
|
5,27.0,0
|
||||||
|
6,55.0,0
|
||||||
|
7,17.0,0
|
||||||
|
8,23.0,0
|
||||||
|
9,9.0,0
|
||||||
|
10,17.0,0
|
||||||
|
11,14.0,0
|
||||||
|
12,17.0,0
|
||||||
|
13,12.0,0
|
||||||
|
14,14.0,0
|
||||||
|
15,16.0,0
|
||||||
|
16,27.0,0
|
||||||
|
17,36.0,0
|
||||||
|
18,17.0,0
|
||||||
|
19,17.0,0
|
||||||
|
20,21.0,0
|
||||||
|
21,23.0,0
|
||||||
|
22,13.0,0
|
||||||
|
23,12.0,0
|
||||||
|
24,17.0,0
|
||||||
|
25,26.0,0
|
||||||
|
26,25.0,0
|
||||||
|
27,17.0,0
|
||||||
|
28,10.0,0
|
||||||
|
29,16.0,0
|
||||||
|
30,14.0,0
|
||||||
|
31,19.0,0
|
||||||
|
32,23.0,0
|
||||||
|
33,37.0,0
|
||||||
|
34,29.0,0
|
||||||
|
35,22.0,0
|
||||||
|
36,29.0,0
|
||||||
|
37,15.0,0
|
||||||
|
38,16.0,0
|
||||||
|
39,18.0,0
|
||||||
|
40,23.0,0
|
||||||
|
41,16.0,0
|
||||||
|
42,26.0,0
|
||||||
|
43,13.0,0
|
||||||
|
44,24.0,0
|
||||||
|
45,39.0,0
|
||||||
|
46,23.0,0
|
||||||
|
47,32.0,0
|
||||||
|
48,123.0,0
|
||||||
|
49,18.0,0
|
||||||
|
50,39.0,0
|
||||||
|
51,17.0,0
|
||||||
|
52,28.0,0
|
||||||
|
53,34.0,0
|
||||||
|
54,26.0,0
|
||||||
|
55,61.0,0
|
||||||
|
56,28.0,0
|
||||||
|
57,16.0,0
|
||||||
|
58,45.0,0
|
||||||
|
59,41.0,0
|
||||||
|
60,49.0,0
|
||||||
|
61,18.0,0
|
||||||
|
62,40.0,0
|
||||||
|
63,24.0,0
|
||||||
|
64,37.0,0
|
||||||
|
65,26.0,0
|
||||||
|
66,51.0,0
|
||||||
|
67,17.0,0
|
||||||
|
68,152.0,0
|
||||||
|
69,17.0,0
|
||||||
|
70,29.0,0
|
||||||
|
71,37.0,0
|
||||||
|
72,15.0,0
|
||||||
|
73,55.0,0
|
||||||
|
74,152.0,0
|
||||||
|
75,23.0,0
|
||||||
|
76,45.0,0
|
||||||
|
77,30.0,0
|
||||||
|
78,39.0,0
|
||||||
|
79,20.0,0
|
||||||
|
80,53.0,0
|
||||||
|
81,49.0,0
|
||||||
|
82,71.0,0
|
||||||
|
83,115.0,0
|
||||||
|
84,41.0,0
|
||||||
|
85,52.0,0
|
||||||
|
86,52.0,0
|
||||||
|
87,36.0,0
|
||||||
|
88,84.0,0
|
||||||
|
89,122.0,0
|
||||||
|
90,49.0,0
|
||||||
|
91,200.0,0
|
||||||
|
92,67.0,0
|
||||||
|
93,87.0,0
|
||||||
|
94,183.0,0
|
||||||
|
95,132.0,0
|
||||||
|
96,76.0,0
|
||||||
|
97,200.0,0
|
||||||
|
98,200.0,0
|
||||||
|
99,200.0,0
|
||||||
|
100,200.0,0
|
||||||
|
101,200.0,0
|
||||||
|
102,106.0,0
|
||||||
|
103,192.0,0
|
||||||
|
104,111.0,0
|
||||||
|
105,95.0,0
|
||||||
|
106,200.0,0
|
||||||
|
107,200.0,0
|
||||||
|
108,148.0,0
|
||||||
|
109,200.0,0
|
||||||
|
110,97.0,0
|
||||||
|
111,200.0,0
|
||||||
|
112,200.0,0
|
||||||
|
113,105.0,0
|
||||||
|
114,135.0,0
|
||||||
|
115,200.0,0
|
||||||
|
116,144.0,0
|
||||||
|
117,156.0,0
|
||||||
|
118,200.0,0
|
||||||
|
119,200.0,0
|
||||||
|
120,166.0,0
|
||||||
|
121,200.0,0
|
||||||
|
122,200.0,0
|
||||||
|
123,200.0,0
|
||||||
|
124,200.0,0
|
||||||
|
125,200.0,0
|
||||||
|
126,200.0,0
|
||||||
|
127,158.0,0
|
||||||
|
128,139.0,0
|
||||||
|
129,200.0,0
|
||||||
|
130,200.0,0
|
||||||
|
131,200.0,0
|
||||||
|
132,200.0,0
|
||||||
|
133,122.0,0
|
||||||
|
134,200.0,0
|
||||||
|
135,188.0,0
|
||||||
|
136,200.0,0
|
||||||
|
137,183.0,0
|
||||||
|
138,200.0,0
|
||||||
|
139,200.0,0
|
||||||
|
140,200.0,0
|
||||||
|
141,200.0,0
|
||||||
|
142,200.0,0
|
||||||
|
143,158.0,0
|
||||||
|
144,200.0,0
|
||||||
|
145,200.0,0
|
||||||
|
146,200.0,0
|
||||||
|
147,191.0,0
|
||||||
|
148,200.0,0
|
||||||
|
149,194.0,0
|
||||||
|
150,178.0,0
|
||||||
|
151,200.0,0
|
||||||
|
152,200.0,0
|
||||||
|
153,200.0,0
|
||||||
|
154,162.0,0
|
||||||
|
155,200.0,0
|
||||||
|
156,200.0,0
|
||||||
|
157,128.0,0
|
||||||
|
158,200.0,0
|
||||||
|
159,184.0,0
|
||||||
|
160,194.0,0
|
||||||
|
161,200.0,0
|
||||||
|
162,200.0,0
|
||||||
|
163,200.0,0
|
||||||
|
164,200.0,0
|
||||||
|
165,160.0,0
|
||||||
|
166,163.0,0
|
||||||
|
167,200.0,0
|
||||||
|
168,200.0,0
|
||||||
|
169,200.0,0
|
||||||
|
170,141.0,0
|
||||||
|
171,200.0,0
|
||||||
|
172,200.0,0
|
||||||
|
173,200.0,0
|
||||||
|
174,200.0,0
|
||||||
|
175,200.0,0
|
||||||
|
176,200.0,0
|
||||||
|
177,157.0,0
|
||||||
|
178,164.0,0
|
||||||
|
179,200.0,0
|
||||||
|
180,200.0,0
|
||||||
|
181,200.0,0
|
||||||
|
182,200.0,0
|
||||||
|
183,200.0,0
|
||||||
|
184,200.0,0
|
||||||
|
185,193.0,0
|
||||||
|
186,182.0,0
|
||||||
|
187,200.0,0
|
||||||
|
188,200.0,0
|
||||||
|
189,200.0,0
|
||||||
|
190,200.0,0
|
||||||
|
191,200.0,0
|
||||||
|
192,174.0,0
|
||||||
|
193,178.0,0
|
||||||
|
194,200.0,0
|
||||||
|
195,200.0,0
|
||||||
|
196,200.0,0
|
||||||
|
197,200.0,0
|
||||||
|
198,200.0,0
|
||||||
|
199,200.0,0
|
||||||
|
@@ -1,125 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding=utf-8
|
|
||||||
'''
|
|
||||||
Author: JiangJi
|
|
||||||
Email: johnjim0816@gmail.com
|
|
||||||
Date: 2021-11-07 18:10:37
|
|
||||||
LastEditor: JiangJi
|
|
||||||
LastEditTime: 2022-07-21 21:52:31
|
|
||||||
Discription:
|
|
||||||
'''
|
|
||||||
import sys,os
|
|
||||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
|
||||||
parent_path = os.path.dirname(curr_path) # parent path
|
|
||||||
sys.path.append(parent_path) # add to system path
|
|
||||||
|
|
||||||
import gym
|
|
||||||
import torch
|
|
||||||
import datetime
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
from common.utils import save_results,make_dir
|
|
||||||
from common.utils import plot_rewards,save_args
|
|
||||||
from common.models import MLP
|
|
||||||
from common.memories import ReplayBuffer
|
|
||||||
from DoubleDQN.double_dqn import DoubleDQN
|
|
||||||
|
|
||||||
def get_args():
|
|
||||||
""" 超参数
|
|
||||||
"""
|
|
||||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
|
||||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
|
||||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
|
||||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
|
||||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
|
||||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
|
||||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
|
||||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
|
||||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
|
||||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
|
||||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
|
||||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
|
||||||
parser.add_argument('--batch_size',default=64,type=int)
|
|
||||||
parser.add_argument('--target_update',default=4,type=int)
|
|
||||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
|
||||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
|
||||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
|
||||||
'/' + curr_time + '/results/' )
|
|
||||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
|
||||||
'/' + curr_time + '/models/' ) # path to save models
|
|
||||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
|
||||||
args = parser.parse_args()
|
|
||||||
return args
|
|
||||||
|
|
||||||
|
|
||||||
def env_agent_config(cfg,seed=1):
|
|
||||||
env = gym.make(cfg.env_name)
|
|
||||||
env.seed(seed)
|
|
||||||
n_states = env.observation_space.shape[0]
|
|
||||||
n_actions = env.action_space.n
|
|
||||||
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
|
|
||||||
memory = ReplayBuffer(cfg.memory_capacity)
|
|
||||||
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
|
|
||||||
return env,agent
|
|
||||||
|
|
||||||
def train(cfg,env,agent):
|
|
||||||
print("开始训练!")
|
|
||||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
|
||||||
rewards = [] # 记录所有回合的奖励
|
|
||||||
for i_ep in range(cfg.train_eps):
|
|
||||||
ep_reward = 0 # 记录一回合内的奖励
|
|
||||||
state = env.reset() # 重置环境,返回初始状态
|
|
||||||
while True:
|
|
||||||
action = agent.sample(state)
|
|
||||||
next_state, reward, done, _ = env.step(action)
|
|
||||||
ep_reward += reward
|
|
||||||
agent.memory.push(state, action, reward, next_state, done)
|
|
||||||
state = next_state
|
|
||||||
agent.update()
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
if i_ep % cfg.target_update == 0:
|
|
||||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
|
||||||
if (i_ep+1)%10 == 0:
|
|
||||||
print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
print("Finish training!")
|
|
||||||
return {'rewards':rewards}
|
|
||||||
|
|
||||||
def test(cfg,env,agent):
|
|
||||||
print("开始测试!")
|
|
||||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
|
||||||
rewards = [] # 记录所有回合的奖励
|
|
||||||
for i_ep in range(cfg.test_eps):
|
|
||||||
state = env.reset()
|
|
||||||
ep_reward = 0
|
|
||||||
while True:
|
|
||||||
action = agent.predict(state)
|
|
||||||
next_state, reward, done, _ = env.step(action)
|
|
||||||
state = next_state
|
|
||||||
ep_reward += reward
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
|
|
||||||
print("完成测试!")
|
|
||||||
return {'rewards':rewards}
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
cfg = get_args()
|
|
||||||
# training
|
|
||||||
env, agent = env_agent_config(cfg,seed=1)
|
|
||||||
res_dic = train(cfg, env, agent)
|
|
||||||
make_dir(cfg.result_path, cfg.model_path)
|
|
||||||
save_args(cfg) # save parameters
|
|
||||||
agent.save(path=cfg.model_path) # save the model
|
|
||||||
save_results(res_dic, tag='train',
|
|
||||||
path=cfg.result_path)
|
|
||||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
|
||||||
# testing
|
|
||||||
env, agent = env_agent_config(cfg,seed=1)
|
|
||||||
agent.load(path=cfg.model_path) # load the model
|
|
||||||
res_dic = test(cfg, env, agent)
|
|
||||||
save_results(res_dic, tag='test',
|
|
||||||
path=cfg.result_path) # save results
|
|
||||||
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results
|
|
||||||
projects/codes/scripts/DoubleDQN_CartPole-v0.sh (new file, 15 lines)
@@ -0,0 +1,15 @@
|
|||||||
|
# run Double DQN on CartPole-v0
|
||||||
|
# source conda, if you are already in proper conda environment, then comment the codes util "conda activate easyrl"
|
||||||
|
|
||||||
|
if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then
|
||||||
|
echo "source file at ~/anaconda3/etc/profile.d/conda.sh"
|
||||||
|
source ~/anaconda3/etc/profile.d/conda.sh
|
||||||
|
elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then
|
||||||
|
echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh"
|
||||||
|
source ~/opt/anaconda3/etc/profile.d/conda.sh
|
||||||
|
else
|
||||||
|
echo 'please manually config the conda source path'
|
||||||
|
fi
|
||||||
|
conda activate easyrl # easyrl here can be changed to another name of conda env that you have created
|
||||||
|
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
|
||||||
|
python $codes_dir/DoubleDQN/main.py --device cuda
|
||||||