hot update Double DQN

This commit is contained in:
johnjim0816
2022-08-30 16:29:57 +08:00
parent 0b0f7e857d
commit 764ba63d40
26 changed files with 803 additions and 365 deletions

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-08-23 23:59:54
LastEditTime: 2022-08-29 23:30:08
@Discription:
@Environment: python 3.7.7
'''
@@ -78,7 +78,7 @@ class DQN:
self.batch_size)
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
# print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
@@ -91,7 +91,7 @@ class DQN:
# compute expected Q value; for a terminal state the corresponding done_batch entry is 1, so expected_q_value equals the reward
expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch)
# print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to
# backpropagation
self.optimizer.zero_grad()
loss.backward()
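Note on the shape comments above: if reward_batch were left 1-D, adding it to the (batchsize,1) Q-value tensor would silently broadcast to (batchsize,batchsize). A minimal illustration with toy tensors (not part of this diff):

import torch
r = torch.tensor([1., 1., 1.])              # shape (3,)
q = torch.tensor([[0.5], [0.2], [0.9]])     # shape (3, 1)
print((r + q).shape)                        # torch.Size([3, 3]) -- silent broadcasting bug
print((r.unsqueeze(1) + q).shape)           # torch.Size([3, 1]) -- what the code above intends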

View File

@@ -9,130 +9,122 @@ import torch
import datetime
import numpy as np
import argparse
from common.utils import save_results,all_seed
from common.utils import plot_rewards,save_args
from common.utils import all_seed
from common.models import MLP
from common.memories import ReplayBuffer
from common.launcher import Launcher
from envs.register import register_env
from dqn import DQN
class Main(Launcher):
def get_args(self):
""" hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="max steps per episode; a large value approximates an infinite horizon")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon; the higher the value, the slower the decay")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
# please manually change the following args in this script if you want
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models' )
args = parser.parse_args()
args = {**vars(args)} # type(dict)
return args
def get_args():
""" hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="max steps per episode; a large value approximates an infinite horizon")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon; the higher the value, the slower the decay")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
# please manually change the following args in this script if you want
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models' )
args = parser.parse_args()
args = {**vars(args)} # type(dict)
return args
def env_agent_config(cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
agent = DQN(model,memory,cfg) # create agent
return env, agent
def env_agent_config(cfg):
''' create env and agent
'''
env = gym.make(cfg['env_name']) # create env
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
agent = DQN(model,memory,cfg) # create agent
return env, agent
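The try/except in env_agent_config above distinguishes discrete observation spaces (which expose .n) from box spaces (which expose .shape). A small illustration with two standard environments, assuming the classic pre-0.26 gym API used throughout this diff:

import gym

env = gym.make('CartPole-v0')           # Box observation space: no .n attribute
print(env.observation_space.shape[0])   # 4 -> n_states = 4

env = gym.make('FrozenLake-v0')         # Discrete observation space: exposes .n
print(env.observation_space.n)          # 16 -> n_states = 16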
def train(cfg, env, agent):
''' train
'''
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step += 1
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.memory.push(state, action, reward,
next_state, done) # save transitions
state = next_state # update next state for env
agent.update() # update agent
ep_reward += reward #
if done:
break
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
return res_dic
def train(cfg, env, agent):
''' train
'''
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step += 1
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.memory.push(state, action, reward,
next_state, done) # save transitions
state = next_state # update next state for env
agent.update() # update agent
ep_reward += reward #
if done:
break
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step+=1
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(cfg, env, agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step+=1
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg['result_path']) # save parameters
agent.save_model(path = cfg['model_path']) # save models
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
main = Main()
main.run()
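A side note on the config handling above: the refactored get_args() returns a plain dict (args = {**vars(args)}), so the rest of the code indexes cfg['lr'] rather than cfg.lr as the older scripts did. A minimal sketch of that conversion, illustrative only:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--lr', default=0.0001, type=float)
args = parser.parse_args([])    # empty list: take the defaults, ignoring the real command line
cfg = {**vars(args)}            # argparse.Namespace -> dict, as in get_args() above
print(args.lr, cfg['lr'])       # 0.0001 0.0001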

View File

@@ -1 +1,21 @@
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true}
{
"algo_name": "DQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.95,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"device": "cpu",
"seed": 10,
"result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results",
"model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models",
"show_fig": false,
"save_fig": true
}

View File

@@ -1 +1,24 @@
{"algo_name": "DQN", "env_name": "CartPole-v1", "train_eps": 2000, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 6000, "lr": 1e-05, "memory_capacity": 200000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", "n_states": 4, "n_actions": 2}
{
"algo_name": "DQN",
"env_name": "CartPole-v1",
"train_eps": 2000,
"test_eps": 20,
"ep_max_steps": 100000,
"gamma": 0.99,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 6000,
"lr": 1e-05,
"memory_capacity": 200000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"device": "cuda",
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models",
"n_states": 4,
"n_actions": 2
}

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-07-21 00:08:26
LastEditTime: 2022-08-29 23:34:20
@Discription:
@Environment: python 3.7.7
'''
@@ -20,148 +20,87 @@ import torch.nn.functional as F
import random
import math
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' the buffer works as a queue: once capacity is exceeded, the oldest transitions are dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' return the current number of stored transitions
'''
return len(self.buffer)
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
class DoubleDQN:
def __init__(self, n_states, n_actions, model, memory, cfg):
self.n_actions = n_actions # total number of actions
self.device = torch.device(cfg.device) # device, cpu or cuda
self.gamma = cfg.gamma
# epsilon-greedy policy parameters
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
def __init__(self,models, memories, cfg):
self.n_actions = cfg['n_actions']
self.device = torch.device(cfg['device'])
self.gamma = cfg['gamma']
## e-greedy parameters
self.sample_count = 0 # sample count for epsilon decay
self.epsilon_start = cfg['epsilon_start']
self.epsilon_end = cfg['epsilon_end']
self.epsilon_decay = cfg['epsilon_decay']
self.batch_size = cfg['batch_size']
self.policy_net = models['Qnet'].to(self.device)
self.target_net = models['Qnet'].to(self.device)
# target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data)
# self.target_net.eval() # do not enable BatchNormalization or Dropout
# see the difference between parameters() and state_dict(): the former has requires_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = memory
# self.target_net.eval() # do not use BatchNormalization or Dropout
# the difference between parameters() and state_dict() is that parameters() has requires_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr'])
self.memory = memories['Memory']
self.update_flag = False
def sample(self, state):
''' select action
def sample_action(self, state):
''' sample action
'''
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
if random.random() > self.epsilon:
with torch.no_grad():
# convert to tensor first so it can be fed to the network; state elements are originally float64
# note: state = torch.tensor(state).unsqueeze(0) is equivalent to state = torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
# tensor.max(1) returns the max value of each row together with its index,
# e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the max value, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
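A quick numeric check of the exponential epsilon decay used in sample_action above, plugging in the defaults from the argparse blocks (epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=500); illustrative only:

import math

eps_start, eps_end, eps_decay = 0.95, 0.01, 500
for t in (0, 500, 2000):
    eps = eps_end + (eps_start - eps_end) * math.exp(-t / eps_decay)
    print(t, round(eps, 3))   # 0 -> 0.95, 500 -> ~0.356, 2000 -> ~0.027
# roughly 63% of the gap between eps_start and eps_end is gone after eps_decay samples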
def predict(self, state):
''' select action
def predict_action(self, state):
''' predict action
'''
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
return action
def update(self):
if len(self.memory) < self.batch_size: # only update once the memory holds enough transitions
if len(self.memory) < self.batch_size: # when the memory does not yet hold a full batch of transitions, do not update
return
# randomly sample transitions from memory
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
else:
if not self.update_flag:
print("Begin to update!")
self.update_flag = True
# sample a batch of transitions from replay buffer
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
# convert to tensor
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
1) # e.g. tensor([[1],...,[0]])
reward_batch = torch.tensor(
reward_batch, device=self.device, dtype=torch.float) # e.g. tensor([1., 1., ..., 1.])
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device) # convert bool to float, then to tensor
# compute Q(s_t, a) for the current state batch
q_values = self.policy_net(state_batch)
next_q_values = self.policy_net(next_state_batch)
# plug in the chosen actions to get Q(s_t, a=a_t)
q_value = q_values.gather(dim=1, index=action_batch)
'''the following is the Nature DQN way of computing q_target
# compute the max of Q'(s_{t+1}) over all next states, where Q' is the target network's Q function
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
# compute q_target
# for terminal states the corresponding done_batch entry is 1, so expected_q_value equals the reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''the following is the Double DQN way of computing q_target, slightly different from Nature DQN'''
next_target_values = self.target_net(
next_state_batch)
# select the greedy action from the policy net's Q(s_{t+1}, a) and plug it into next_target_values to get the target net's next_q_value, i.e. Q_target(s_{t+1}, argmax_a Q(s_{t+1}, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # mean squared error loss
# optimize the model
self.optimizer.zero_grad() # zero_grad clears all old gradients from the last step
# loss.backward() uses backpropagation to compute gradients of the loss w.r.t. all parameters (that require gradients)
self.loss.backward()
for param in self.policy_net.parameters(): # clip to avoid gradient explosion
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
# compute current Q(s_t|a=a_t)
q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
next_q_value_batch = self.policy_net(next_state_batch)
'''the following is the Double DQN way of computing expected_q_value, slightly different from Nature DQN'''
next_target_value_batch = self.target_net(next_state_batch)
# the policy net chooses the greedy action, the target net evaluates it, i.e. Q_target(s_{t+1}, argmax_a Q(s_{t+1}, a))
next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1)
expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch)
loss = nn.MSELoss()(q_value_batch , expected_q_value_batch)
self.optimizer.zero_grad()
loss.backward()
# clip to avoid gradient explosion
for param in self.policy_net.parameters():
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # update the model
self.optimizer.step()
def save(self,path):
def save_model(self,path):
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
def load(self,path):
def load_model(self,path):
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)
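To make the change in this file concrete, here is a small standalone sketch contrasting the Nature DQN target with the Double DQN target computed in update() above, on toy tensors (variable names mirror the code; the numbers are made up):

import torch

# toy Q-values for 3 next states and 2 actions, from the policy net and the target net
next_q_policy = torch.tensor([[1.0, 2.0], [0.5, 0.1], [3.0, 2.5]])   # Q(s', .) from policy_net
next_q_target = torch.tensor([[1.8, 0.5], [0.4, 0.3], [2.0, 2.2]])   # Q(s', .) from target_net
reward = torch.tensor([[1.0], [1.0], [1.0]])
done = torch.tensor([[0.0], [0.0], [1.0]])
gamma = 0.95

# Nature DQN: the target net both selects and evaluates the next action
nature_target = reward + gamma * next_q_target.max(1, keepdim=True)[0] * (1 - done)

# Double DQN: the policy net selects the action, the target net evaluates it
best_action = next_q_policy.max(1)[1].unsqueeze(1)                   # argmax_a Q(s', a; policy_net)
double_target = reward + gamma * next_q_target.gather(1, best_action) * (1 - done)

print(nature_target.squeeze(1))   # tensor([2.7100, 1.3800, 1.0000])
print(double_target.squeeze(1))   # tensor([1.4750, 1.3800, 1.0000]) -- less prone to overestimation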

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-11-07 18:10:37
LastEditor: JiangJi
LastEditTime: 2022-08-29 23:33:31
Discription:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import datetime
import argparse
from common.utils import all_seed
from common.models import MLP
from common.memories import ReplayBufferQue
from DoubleDQN.double_dqn import DoubleDQN
from common.launcher import Launcher
from envs.register import register_env
class Main(Launcher):
def get_args(self):
''' hyperparameters
'''
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="max steps per episode; a large value approximates an infinite horizon")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])}
memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])}
agent = DoubleDQN(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step += 1
action = agent.sample_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push((state, action, reward, next_state, done))
state = next_state
agent.update()
if done:
break
if i_ep % cfg['target_update'] == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
return res_dic
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step += 1
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()
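Note that this script pushes a whole transition tuple, agent.memory.push((state, action, reward, next_state, done)), whereas the DQN script above passes the fields separately. The actual common.memories.ReplayBufferQue is not part of this diff; a hypothetical deque-based sketch that would match the call sites used here:

import random
from collections import deque

class ReplayBufferQue:
    '''hypothetical sketch only; see common/memories.py in the repo for the real implementation'''
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # old transitions are dropped automatically
    def push(self, transition):
        self.buffer.append(transition)         # a single (state, action, reward, next_state, done) tuple
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)                     # -> states, actions, rewards, next_states, dones
    def __len__(self):
        return len(self.buffer)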

View File

@@ -1 +0,0 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}

Binary file not shown.

Before: image, 34 KiB

Binary file not shown.

Before: image, 43 KiB

View File

@@ -0,0 +1 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2}

Binary file not shown.

After: image, 53 KiB

View File

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,145.0,0
1,166.0,0
2,171.0,0
3,200.0,0
4,139.0,0
5,200.0,0
6,200.0,0
7,141.0,0
8,200.0,0
9,187.0,0
10,166.0,0
11,172.0,0
12,121.0,0
13,200.0,0
14,200.0,0
15,149.0,0
16,128.0,0
17,200.0,0
18,178.0,0
19,185.0,0

Binary file not shown.

After: image, 65 KiB

View File

@@ -0,0 +1,201 @@
episodes,rewards,steps
0,19.0,0
1,16.0,0
2,17.0,0
3,11.0,0
4,10.0,0
5,27.0,0
6,16.0,0
7,9.0,0
8,20.0,0
9,21.0,0
10,15.0,0
11,10.0,0
12,14.0,0
13,37.0,0
14,12.0,0
15,10.0,0
16,27.0,0
17,33.0,0
18,19.0,0
19,13.0,0
20,26.0,0
21,15.0,0
22,29.0,0
23,11.0,0
24,20.0,0
25,23.0,0
26,23.0,0
27,26.0,0
28,17.0,0
29,33.0,0
30,16.0,0
31,48.0,0
32,48.0,0
33,69.0,0
34,58.0,0
35,24.0,0
36,18.0,0
37,28.0,0
38,12.0,0
39,12.0,0
40,18.0,0
41,12.0,0
42,13.0,0
43,21.0,0
44,30.0,0
45,32.0,0
46,22.0,0
47,18.0,0
48,12.0,0
49,12.0,0
50,20.0,0
51,32.0,0
52,15.0,0
53,100.0,0
54,26.0,0
55,25.0,0
56,18.0,0
57,15.0,0
58,35.0,0
59,12.0,0
60,65.0,0
61,27.0,0
62,29.0,0
63,22.0,0
64,83.0,0
65,24.0,0
66,28.0,0
67,15.0,0
68,43.0,0
69,13.0,0
70,22.0,0
71,46.0,0
72,14.0,0
73,32.0,0
74,44.0,0
75,53.0,0
76,31.0,0
77,51.0,0
78,61.0,0
79,30.0,0
80,36.0,0
81,30.0,0
82,48.0,0
83,26.0,0
84,27.0,0
85,43.0,0
86,20.0,0
87,87.0,0
88,71.0,0
89,43.0,0
90,57.0,0
91,40.0,0
92,37.0,0
93,43.0,0
94,31.0,0
95,45.0,0
96,47.0,0
97,52.0,0
98,48.0,0
99,98.0,0
100,49.0,0
101,98.0,0
102,68.0,0
103,70.0,0
104,74.0,0
105,73.0,0
106,127.0,0
107,92.0,0
108,70.0,0
109,97.0,0
110,66.0,0
111,112.0,0
112,138.0,0
113,81.0,0
114,74.0,0
115,153.0,0
116,113.0,0
117,88.0,0
118,138.0,0
119,200.0,0
120,84.0,0
121,123.0,0
122,158.0,0
123,171.0,0
124,137.0,0
125,143.0,0
126,170.0,0
127,127.0,0
128,118.0,0
129,200.0,0
130,189.0,0
131,149.0,0
132,137.0,0
133,115.0,0
134,153.0,0
135,136.0,0
136,140.0,0
137,169.0,0
138,187.0,0
139,200.0,0
140,196.0,0
141,200.0,0
142,200.0,0
143,137.0,0
144,200.0,0
145,185.0,0
146,200.0,0
147,164.0,0
148,200.0,0
149,143.0,0
150,143.0,0
151,112.0,0
152,192.0,0
153,200.0,0
154,144.0,0
155,188.0,0
156,200.0,0
157,133.0,0
158,200.0,0
159,143.0,0
160,158.0,0
161,161.0,0
162,169.0,0
163,176.0,0
164,200.0,0
165,149.0,0
166,156.0,0
167,200.0,0
168,200.0,0
169,200.0,0
170,134.0,0
171,171.0,0
172,200.0,0
173,200.0,0
174,200.0,0
175,194.0,0
176,200.0,0
177,138.0,0
178,159.0,0
179,187.0,0
180,200.0,0
181,192.0,0
182,200.0,0
183,200.0,0
184,200.0,0
185,173.0,0
186,200.0,0
187,178.0,0
188,176.0,0
189,196.0,0
190,200.0,0
191,195.0,0
192,158.0,0
193,156.0,0
194,200.0,0
195,200.0,0
196,200.0,0
197,200.0,0
198,193.0,0
199,200.0,0

View File

@@ -0,0 +1 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2}

Binary file not shown.

After: image, 40 KiB

View File

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,200.0,0
1,200.0,0
2,200.0,0
3,200.0,0
4,191.0,0
5,200.0,0
6,200.0,0
7,179.0,0
8,200.0,0
9,200.0,0
10,200.0,0
11,190.0,0
12,147.0,0
13,197.0,0
14,200.0,0
15,200.0,0
16,167.0,0
17,200.0,0
18,200.0,0
19,200.0,0

Binary file not shown.

After: image, 65 KiB

View File

@@ -0,0 +1,201 @@
episodes,rewards,steps
0,19.0,0
1,16.0,0
2,17.0,0
3,11.0,0
4,10.0,0
5,27.0,0
6,55.0,0
7,17.0,0
8,23.0,0
9,9.0,0
10,17.0,0
11,14.0,0
12,17.0,0
13,12.0,0
14,14.0,0
15,16.0,0
16,27.0,0
17,36.0,0
18,17.0,0
19,17.0,0
20,21.0,0
21,23.0,0
22,13.0,0
23,12.0,0
24,17.0,0
25,26.0,0
26,25.0,0
27,17.0,0
28,10.0,0
29,16.0,0
30,14.0,0
31,19.0,0
32,23.0,0
33,37.0,0
34,29.0,0
35,22.0,0
36,29.0,0
37,15.0,0
38,16.0,0
39,18.0,0
40,23.0,0
41,16.0,0
42,26.0,0
43,13.0,0
44,24.0,0
45,39.0,0
46,23.0,0
47,32.0,0
48,123.0,0
49,18.0,0
50,39.0,0
51,17.0,0
52,28.0,0
53,34.0,0
54,26.0,0
55,61.0,0
56,28.0,0
57,16.0,0
58,45.0,0
59,41.0,0
60,49.0,0
61,18.0,0
62,40.0,0
63,24.0,0
64,37.0,0
65,26.0,0
66,51.0,0
67,17.0,0
68,152.0,0
69,17.0,0
70,29.0,0
71,37.0,0
72,15.0,0
73,55.0,0
74,152.0,0
75,23.0,0
76,45.0,0
77,30.0,0
78,39.0,0
79,20.0,0
80,53.0,0
81,49.0,0
82,71.0,0
83,115.0,0
84,41.0,0
85,52.0,0
86,52.0,0
87,36.0,0
88,84.0,0
89,122.0,0
90,49.0,0
91,200.0,0
92,67.0,0
93,87.0,0
94,183.0,0
95,132.0,0
96,76.0,0
97,200.0,0
98,200.0,0
99,200.0,0
100,200.0,0
101,200.0,0
102,106.0,0
103,192.0,0
104,111.0,0
105,95.0,0
106,200.0,0
107,200.0,0
108,148.0,0
109,200.0,0
110,97.0,0
111,200.0,0
112,200.0,0
113,105.0,0
114,135.0,0
115,200.0,0
116,144.0,0
117,156.0,0
118,200.0,0
119,200.0,0
120,166.0,0
121,200.0,0
122,200.0,0
123,200.0,0
124,200.0,0
125,200.0,0
126,200.0,0
127,158.0,0
128,139.0,0
129,200.0,0
130,200.0,0
131,200.0,0
132,200.0,0
133,122.0,0
134,200.0,0
135,188.0,0
136,200.0,0
137,183.0,0
138,200.0,0
139,200.0,0
140,200.0,0
141,200.0,0
142,200.0,0
143,158.0,0
144,200.0,0
145,200.0,0
146,200.0,0
147,191.0,0
148,200.0,0
149,194.0,0
150,178.0,0
151,200.0,0
152,200.0,0
153,200.0,0
154,162.0,0
155,200.0,0
156,200.0,0
157,128.0,0
158,200.0,0
159,184.0,0
160,194.0,0
161,200.0,0
162,200.0,0
163,200.0,0
164,200.0,0
165,160.0,0
166,163.0,0
167,200.0,0
168,200.0,0
169,200.0,0
170,141.0,0
171,200.0,0
172,200.0,0
173,200.0,0
174,200.0,0
175,200.0,0
176,200.0,0
177,157.0,0
178,164.0,0
179,200.0,0
180,200.0,0
181,200.0,0
182,200.0,0
183,200.0,0
184,200.0,0
185,193.0,0
186,182.0,0
187,200.0,0
188,200.0,0
189,200.0,0
190,200.0,0
191,200.0,0
192,174.0,0
193,178.0,0
194,200.0,0
195,200.0,0
196,200.0,0
197,200.0,0
198,200.0,0
199,200.0,0

View File

@@ -1,125 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-11-07 18:10:37
LastEditor: JiangJi
LastEditTime: 2022-07-21 21:52:31
Discription:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from common.utils import save_results,make_dir
from common.utils import plot_rewards,save_args
from common.models import MLP
from common.memories import ReplayBuffer
from DoubleDQN.double_dqn import DoubleDQN
def get_args():
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
memory = ReplayBuffer(cfg.memory_capacity)
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print("开始训练!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.sample(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
state = next_state
agent.update()
if done:
break
if i_ep % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
if (i_ep+1)%10 == 0:
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}Epislon{agent.epsilon:.3f}')
rewards.append(ep_reward)
print("完成训练!")
return {'rewards':rewards}
def test(cfg,env,agent):
print("开始测试!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.predict(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
print("完成测试!")
return {'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg,seed=1)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# testing
env, agent = env_agent_config(cfg,seed=1)
agent.load(path=cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results

View File

@@ -0,0 +1,15 @@
# run Double DQN on CartPole-v0
# source conda; if you are already in the proper conda environment, comment out the lines up to "conda activate easyrl"
if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/anaconda3/etc/profile.d/conda.sh"
source ~/anaconda3/etc/profile.d/conda.sh
elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh"
source ~/opt/anaconda3/etc/profile.d/conda.sh
else
echo 'please manually config the conda source path'
fi
conda activate easyrl # easyrl here can be changed to another name of conda env that you have created
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
python $codes_dir/DoubleDQN/main.py --device cuda