diff --git a/projects/codes/DQN/dqn.py b/projects/codes/DQN/dqn.py index ecfcc36..fce3a73 100644 --- a/projects/codes/DQN/dqn.py +++ b/projects/codes/DQN/dqn.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2022-08-23 23:59:54 +LastEditTime: 2022-08-29 23:30:08 @Discription: @Environment: python 3.7.7 ''' @@ -78,7 +78,7 @@ class DQN: self.batch_size) state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1) - reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1) next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states) done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1) # print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape) @@ -91,7 +91,7 @@ class DQN: # compute expected q value, for terminal state, done_batch[0]=1, and expected_q_value=rewardcorrespondingly expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch) # print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad) - loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to + loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to # backpropagation self.optimizer.zero_grad() loss.backward() diff --git a/projects/codes/DQN/main.py b/projects/codes/DQN/main.py index d3c022c..651a98e 100644 --- a/projects/codes/DQN/main.py +++ b/projects/codes/DQN/main.py @@ -9,130 +9,122 @@ import torch import datetime import numpy as np import argparse -from common.utils import save_results,all_seed -from common.utils import plot_rewards,save_args +from common.utils import all_seed from common.models import MLP from common.memories import ReplayBuffer +from common.launcher import Launcher +from envs.register import register_env from dqn import DQN +class Main(Launcher): + def get_args(self): + """ hyperparameters + """ + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + parser = argparse.ArgumentParser(description="hyperparameters") + parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm") + parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") + parser.add_argument('--train_eps',default=200,type=int,help="episodes of training") + parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") + parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps") + parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor") + parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") + parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") + parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay") + parser.add_argument('--lr',default=0.0001,type=float,help="learning rate") + parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity") + 
parser.add_argument('--batch_size',default=64,type=int) + parser.add_argument('--target_update',default=4,type=int) + parser.add_argument('--hidden_dim',default=256,type=int) + parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda") + parser.add_argument('--seed',default=10,type=int,help="seed") + parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not") + parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") + # please manually change the following args in this script if you want + parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/results' ) + parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/models' ) + args = parser.parse_args() + args = {**vars(args)} # type(dict) + return args -def get_args(): - """ hyperparameters - """ - curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time - parser = argparse.ArgumentParser(description="hyperparameters") - parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm") - parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") - parser.add_argument('--train_eps',default=200,type=int,help="episodes of training") - parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") - parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps") - parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor") - parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") - parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") - parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay") - parser.add_argument('--lr',default=0.0001,type=float,help="learning rate") - parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity") - parser.add_argument('--batch_size',default=64,type=int) - parser.add_argument('--target_update',default=4,type=int) - parser.add_argument('--hidden_dim',default=256,type=int) - parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda") - parser.add_argument('--seed',default=10,type=int,help="seed") - parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not") - parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") - # please manually change the following args in this script if you want - parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ - '/' + curr_time + '/results' ) - parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ - '/' + curr_time + '/models' ) - args = parser.parse_args() - args = {**vars(args)} # type(dict) - return args + def env_agent_config(cfg): + ''' create env and agent + ''' + register_env(cfg['env_name']) + env = gym.make(cfg['env_name']) + if cfg['seed'] !=0: # set random seed + all_seed(env,seed=cfg["seed"]) + try: # state dimension + n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n')) + except AttributeError: + n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape')) + n_actions = env.action_space.n # 
action dimension + print(f"n_states: {n_states}, n_actions: {n_actions}") + cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters + model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"]) + memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer + agent = DQN(model,memory,cfg) # create agent + return env, agent -def env_agent_config(cfg): - ''' create env and agent - ''' - env = gym.make(cfg['env_name']) # create env - if cfg['seed'] !=0: # set random seed - all_seed(env,seed=cfg["seed"]) - n_states = env.observation_space.shape[0] # state dimension - n_actions = env.action_space.n # action dimension - print(f"n_states: {n_states}, n_actions: {n_actions}") - cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters - model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"]) - memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer - agent = DQN(model,memory,cfg) # create agent - return env, agent + def train(cfg, env, agent): + ''' 训练 + ''' + print("Start training!") + print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") + rewards = [] # record rewards for all episodes + steps = [] + for i_ep in range(cfg["train_eps"]): + ep_reward = 0 # reward per episode + ep_step = 0 + state = env.reset() # reset and obtain initial state + for _ in range(cfg['ep_max_steps']): + ep_step += 1 + action = agent.sample_action(state) # sample action + next_state, reward, done, _ = env.step(action) # update env and return transitions + agent.memory.push(state, action, reward, + next_state, done) # save transitions + state = next_state # update next state for env + agent.update() # update agent + ep_reward += reward # + if done: + break + if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + steps.append(ep_step) + rewards.append(ep_reward) + if (i_ep + 1) % 10 == 0: + print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}') + print("Finish training!") + env.close() + res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} + return res_dic -def train(cfg, env, agent): - ''' 训练 - ''' - print("Start training!") - print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") - rewards = [] # record rewards for all episodes - steps = [] - for i_ep in range(cfg["train_eps"]): - ep_reward = 0 # reward per episode - ep_step = 0 - state = env.reset() # reset and obtain initial state - for _ in range(cfg['ep_max_steps']): - ep_step += 1 - action = agent.sample_action(state) # sample action - next_state, reward, done, _ = env.step(action) # update env and return transitions - agent.memory.push(state, action, reward, - next_state, done) # save transitions - state = next_state # update next state for env - agent.update() # update agent - ep_reward += reward # - if done: - break - if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - steps.append(ep_step) - rewards.append(ep_reward) - if (i_ep + 1) % 10 == 0: - print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}') - print("Finish training!") - env.close() - res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} - return res_dic - -def test(cfg, env, agent): - print("Start testing!") - 
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") - rewards = [] # record rewards for all episodes - steps = [] - for i_ep in range(cfg['test_eps']): - ep_reward = 0 # reward per episode - ep_step = 0 - state = env.reset() # reset and obtain initial state - for _ in range(cfg['ep_max_steps']): - ep_step+=1 - action = agent.predict_action(state) # predict action - next_state, reward, done, _ = env.step(action) - state = next_state - ep_reward += reward - if done: - break - steps.append(ep_step) - rewards.append(ep_reward) - print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}") - print("Finish testing!") - env.close() - return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} + def test(cfg, env, agent): + print("Start testing!") + print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") + rewards = [] # record rewards for all episodes + steps = [] + for i_ep in range(cfg['test_eps']): + ep_reward = 0 # reward per episode + ep_step = 0 + state = env.reset() # reset and obtain initial state + for _ in range(cfg['ep_max_steps']): + ep_step+=1 + action = agent.predict_action(state) # predict action + next_state, reward, done, _ = env.step(action) + state = next_state + ep_reward += reward + if done: + break + steps.append(ep_step) + rewards.append(ep_reward) + print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}") + print("Finish testing!") + env.close() + return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} if __name__ == "__main__": - cfg = get_args() - # training - env, agent = env_agent_config(cfg) - res_dic = train(cfg, env, agent) - save_args(cfg,path = cfg['result_path']) # save parameters - agent.save_model(path = cfg['model_path']) # save models - save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results - plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results - # testing - env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step - agent.load_model(path = cfg['model_path']) # load model - res_dic = test(cfg, env, agent) - save_results(res_dic, tag='test', - path = cfg['result_path']) - plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test") + main = Main() + main.run() diff --git a/projects/codes/DQN/outputs/CartPole-v0/20220823-173936/results/params.json b/projects/codes/DQN/outputs/CartPole-v0/20220823-173936/results/params.json index afad0d9..f57e151 100644 --- a/projects/codes/DQN/outputs/CartPole-v0/20220823-173936/results/params.json +++ b/projects/codes/DQN/outputs/CartPole-v0/20220823-173936/results/params.json @@ -1 +1,21 @@ -{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true} \ No newline at end of file +{ + "algo_name": "DQN", + "env_name": "CartPole-v0", + "train_eps": 200, + "test_eps": 20, + "gamma": 0.95, + "epsilon_start": 0.95, + "epsilon_end": 0.01, + "epsilon_decay": 500, + "lr": 0.0001, + "memory_capacity": 100000, + 
"batch_size": 64, + "target_update": 4, + "hidden_dim": 256, + "device": "cpu", + "seed": 10, + "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", + "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", + "show_fig": false, + "save_fig": true +} \ No newline at end of file diff --git a/projects/codes/DQN/outputs/CartPole-v1/20220828-214702/results/params.json b/projects/codes/DQN/outputs/CartPole-v1/20220828-214702/results/params.json index 83d8c57..c87e5de 100644 --- a/projects/codes/DQN/outputs/CartPole-v1/20220828-214702/results/params.json +++ b/projects/codes/DQN/outputs/CartPole-v1/20220828-214702/results/params.json @@ -1 +1,24 @@ -{"algo_name": "DQN", "env_name": "CartPole-v1", "train_eps": 2000, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 6000, "lr": 1e-05, "memory_capacity": 200000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", "n_states": 4, "n_actions": 2} \ No newline at end of file +{ + "algo_name": "DQN", + "env_name": "CartPole-v1", + "train_eps": 2000, + "test_eps": 20, + "ep_max_steps": 100000, + "gamma": 0.99, + "epsilon_start": 0.95, + "epsilon_end": 0.01, + "epsilon_decay": 6000, + "lr": 1e-05, + "memory_capacity": 200000, + "batch_size": 64, + "target_update": 4, + "hidden_dim": 256, + "device": "cuda", + "seed": 10, + "show_fig": false, + "save_fig": true, + "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", + "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", + "n_states": 4, + "n_actions": 2 +} \ No newline at end of file diff --git a/projects/codes/DoubleDQN/double_dqn.py b/projects/codes/DoubleDQN/double_dqn.py index 0488705..b7f4e97 100644 --- a/projects/codes/DoubleDQN/double_dqn.py +++ b/projects/codes/DoubleDQN/double_dqn.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2022-07-21 00:08:26 +LastEditTime: 2022-08-29 23:34:20 @Discription: @Environment: python 3.7.7 ''' @@ -20,148 +20,87 @@ import torch.nn.functional as F import random import math import numpy as np - -class ReplayBuffer: - def __init__(self, capacity): - self.capacity = capacity # 经验回放的容量 - self.buffer = [] # 缓冲区 - self.position = 0 - - def push(self, state, action, reward, next_state, done): - ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) - ''' - if len(self.buffer) < self.capacity: - self.buffer.append(None) - self.buffer[self.position] = (state, action, reward, next_state, done) - self.position = (self.position + 1) % self.capacity - - def sample(self, batch_size): - batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 - state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 - return state, action, reward, next_state, done - - def __len__(self): - ''' 返回当前存储的量 - ''' - return len(self.buffer) - -class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): - """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态维度 - n_actions: 输出的动作维度 - """ - super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, 
hidden_dim) # 输入层 - self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 - - def forward(self, x): - # 各层对应的激活函数 - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return self.fc3(x) - class DoubleDQN: - def __init__(self, n_states, n_actions, model, memory, cfg): - self.n_actions = n_actions # 总的动作个数 - self.device = torch.device(cfg.device) # 设备,cpu或gpu等 - self.gamma = cfg.gamma - # e-greedy策略相关参数 - self.sample_count = 0 - self.epsilon_start = cfg.epsilon_start - self.epsilon_end = cfg.epsilon_end - self.epsilon_decay = cfg.epsilon_decay - self.batch_size = cfg.batch_size - self.policy_net = model.to(self.device) - self.target_net = model.to(self.device) + def __init__(self,models, memories, cfg): + self.n_actions = cfg['n_actions'] + self.device = torch.device(cfg['device']) + self.gamma = cfg['gamma'] + ## e-greedy parameters + self.sample_count = 0 # sample count for epsilon decay + self.epsilon_start = cfg['epsilon_start'] + self.epsilon_end = cfg['epsilon_end'] + self.epsilon_decay = cfg['epsilon_decay'] + self.batch_size = cfg['batch_size'] + self.policy_net = models['Qnet'].to(self.device) + self.target_net = models['Qnet'].to(self.device) # target_net copy from policy_net for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): target_param.data.copy_(param.data) - # self.target_net.eval() # 不启用 BatchNormalization 和 Dropout - # 可查parameters()与state_dict()的区别,前者require_grad=True - self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) - self.loss = 0 - self.memory = memory + # self.target_net.eval() # donnot use BatchNormalization or Dropout + # the difference between parameters() and state_dict() is that parameters() require_grad=True + self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr']) + self.memory = memories['Memory'] + self.update_flag = False - def sample(self, state): - '''选择动作 + def sample_action(self, state): + ''' sample action ''' self.sample_count += 1 self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. 
* self.sample_count / self.epsilon_decay) if random.random() > self.epsilon: with torch.no_grad(): - # 先转为张量便于丢给神经网络,state元素数据原本为float64 - # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 - state = torch.tensor( - [state], device=self.device, dtype=torch.float32) - # 如tensor([[-0.0798, -0.0079]], grad_fn=) + state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) q_value = self.policy_net(state) - # tensor.max(1)返回每行的最大值以及对应的下标, - # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) - # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: action = random.randrange(self.n_actions) return action - def predict(self, state): - '''选择动作 + def predict_action(self, state): + ''' predict action ''' with torch.no_grad(): - state = torch.tensor([state], device=self.device, dtype=torch.float32) + state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) q_value = self.policy_net(state) action = q_value.max(1)[1].item() return action def update(self): - if len(self.memory) < self.batch_size: # 只有memory满了才会更新 + if len(self.memory) < self.batch_size: # when transitions in memory donot meet a batch, not update return - # 从memory中随机采样transition - state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( - self.batch_size) + else: + if not self.update_flag: + print("Begin to update!") + self.update_flag = True + # sample a batch of transitions from replay buffer + state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size) # convert to tensor - state_batch = torch.tensor( - state_batch, device=self.device, dtype=torch.float) - action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( - 1) # 例如tensor([[1],...,[0]]) - reward_batch = torch.tensor( - reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) - next_state_batch = torch.tensor( - next_state_batch, device=self.device, dtype=torch.float) - - done_batch = torch.tensor(np.float32( - done_batch), device=self.device) # 将bool转为float然后转为张量 - # 计算当前(s_t,a)对应的Q(s_t, a) - q_values = self.policy_net(state_batch) - next_q_values = self.policy_net(next_state_batch) - # 代入当前选择的action,得到Q(s_t|a=a_t) - q_value = q_values.gather(dim=1, index=action_batch) - '''以下是Nature DQN的q_target计算方式 - # 计算所有next states的Q'(s_{t+1})的最大值,Q'为目标网络的q函数 - next_q_state_value = self.target_net( - next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,]) - # 计算 q_target - # 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward - q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0]) - ''' - '''以下是Double DQN q_target计算方式,与NatureDQN稍有不同''' - next_target_values = self.target_net( - next_state_batch) - # 选出Q(s_t‘, a)对应的action,代入到next_target_values获得target net对应的next_q_value,即Q’(s_t|a=argmax Q(s_t‘, a)) - next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1) - q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch) - self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # 计算 均方误差loss - # 优化模型 - self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step - # loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分 - self.loss.backward() - for param in self.policy_net.parameters(): # clip防止梯度爆炸 + state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) + action_batch = 
torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1) + next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1) + # compute current Q(s_t|a=a_t) + q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True + next_q_value_batch = self.policy_net(next_state_batch) + '''the following is the way of computing Double DQN expected_q_value,a bit different from Nature DQN''' + next_target_value_batch = self.target_net(next_state_batch) + # choose action a from Q(s_t‘, a), next_target_values obtain next_q_value,which is Q’(s_t|a=argmax Q(s_t‘, a)) + next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1) + expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch) + loss = nn.MSELoss()(q_value_batch , expected_q_value_batch) + self.optimizer.zero_grad() + loss.backward() + # clip to avoid gradient explosion + for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) - self.optimizer.step() # 更新模型 + self.optimizer.step() - def save(self,path): + def save_model(self,path): + from pathlib import Path + # create path + Path(path).mkdir(parents=True, exist_ok=True) torch.save(self.target_net.state_dict(), path+'checkpoint.pth') - def load(self,path): + def load_model(self,path): self.target_net.load_state_dict(torch.load(path+'checkpoint.pth')) for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): param.data.copy_(target_param.data) diff --git a/projects/codes/DoubleDQN/main.py b/projects/codes/DoubleDQN/main.py new file mode 100644 index 0000000..a66025e --- /dev/null +++ b/projects/codes/DoubleDQN/main.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-11-07 18:10:37 +LastEditor: JiangJi +LastEditTime: 2022-08-29 23:33:31 +Discription: +''' +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # current path +parent_path = os.path.dirname(curr_path) # parent path +sys.path.append(parent_path) # add to system path + +import gym +import datetime +import argparse + +from common.utils import all_seed +from common.models import MLP +from common.memories import ReplayBufferQue +from DoubleDQN.double_dqn import DoubleDQN +from common.launcher import Launcher +from envs.register import register_env +class Main(Launcher): + def get_args(self): + ''' hyperparameters + ''' + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + parser = argparse.ArgumentParser(description="hyperparameters") + parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm") + parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") + parser.add_argument('--train_eps',default=200,type=int,help="episodes of training") + parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") + parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps") + parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor") + 
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") + parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") + parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon") + parser.add_argument('--lr',default=0.0001,type=float,help="learning rate") + parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity") + parser.add_argument('--batch_size',default=64,type=int) + parser.add_argument('--target_update',default=4,type=int) + parser.add_argument('--hidden_dim',default=256,type=int) + parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda") + parser.add_argument('--seed',default=1,type=int,help="seed") + parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not") + parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") + args = parser.parse_args() + default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/", + 'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/", + } + args = {**vars(args),**default_args} # type(dict) + return args + def env_agent_config(self,cfg): + ''' create env and agent + ''' + register_env(cfg['env_name']) + env = gym.make(cfg['env_name']) + if cfg['seed'] !=0: # set random seed + all_seed(env,seed=cfg["seed"]) + try: # state dimension + n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n')) + except AttributeError: + n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape')) + n_actions = env.action_space.n # action dimension + print(f"n_states: {n_states}, n_actions: {n_actions}") + cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters + models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])} + memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])} + agent = DoubleDQN(models,memories,cfg) + return env,agent + + def train(self,cfg,env,agent): + print("Start training!") + print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") + rewards = [] # record rewards for all episodes + steps = [] + for i_ep in range(cfg["train_eps"]): + ep_reward = 0 # reward per episode + ep_step = 0 + state = env.reset() # reset and obtain initial state + for _ in range(cfg['ep_max_steps']): + action = agent.sample_action(state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push((state, action, reward, next_state, done)) + state = next_state + agent.update() + if done: + break + if i_ep % cfg['target_update'] == 0: + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + steps.append(ep_step) + rewards.append(ep_reward) + if (i_ep+1)%10 == 0: + print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}') + print("Finish training!") + env.close() + res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} + return res_dic + + def test(self,cfg,env,agent): + print("Start testing!") + print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") + rewards = [] # record rewards for all episodes + steps = [] + for i_ep in range(cfg['test_eps']): + ep_reward = 0 # reward per episode + ep_step = 0 + state = env.reset() # reset and obtain initial state + for _ in range(cfg['ep_max_steps']): + action = agent.predict_action(state) + next_state, reward, done, _ = 
env.step(action) + state = next_state + ep_reward += reward + if done: + break + steps.append(ep_step) + rewards.append(ep_reward) + print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}") + print("Finish testing!") + env.close() + return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} + +if __name__ == "__main__": + main = Main() + main.run() diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/checkpoint.pth b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/checkpoint.pth deleted file mode 100644 index 2d4c362..0000000 Binary files a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/checkpoint.pth and /dev/null differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/params.json b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/params.json deleted file mode 100644 index 6f83ede..0000000 --- a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/params.json +++ /dev/null @@ -1 +0,0 @@ -{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true} \ No newline at end of file diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards.npy b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards.npy deleted file mode 100644 index c215808..0000000 Binary files a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards.npy and /dev/null differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards_curve.png deleted file mode 100644 index 7b66b67..0000000 Binary files a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/test_rewards_curve.png and /dev/null differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards.npy b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards.npy deleted file mode 100644 index 654d71d..0000000 Binary files a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards.npy and /dev/null differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards_curve.png deleted file mode 100644 index dbf66d7..0000000 Binary files a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/train_rewards_curve.png and /dev/null differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/models/checkpoint.pth b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/models/checkpoint.pth new file mode 100644 index 0000000..d402ba1 Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/models/checkpoint.pth differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/params.json 
b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/params.json new file mode 100644 index 0000000..91df006 --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/params.json @@ -0,0 +1 @@ +{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2} \ No newline at end of file diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_curve.png new file mode 100644 index 0000000..fe21c95 Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_curve.png differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_results.csv b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_results.csv new file mode 100644 index 0000000..2a504ee --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/testing_results.csv @@ -0,0 +1,21 @@ +episodes,rewards,steps +0,145.0,0 +1,166.0,0 +2,171.0,0 +3,200.0,0 +4,139.0,0 +5,200.0,0 +6,200.0,0 +7,141.0,0 +8,200.0,0 +9,187.0,0 +10,166.0,0 +11,172.0,0 +12,121.0,0 +13,200.0,0 +14,200.0,0 +15,149.0,0 +16,128.0,0 +17,200.0,0 +18,178.0,0 +19,185.0,0 diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_curve.png new file mode 100644 index 0000000..a8475ea Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_curve.png differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_results.csv b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_results.csv new file mode 100644 index 0000000..8f87049 --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233435/results/training_results.csv @@ -0,0 +1,201 @@ +episodes,rewards,steps +0,19.0,0 +1,16.0,0 +2,17.0,0 +3,11.0,0 +4,10.0,0 +5,27.0,0 +6,16.0,0 +7,9.0,0 +8,20.0,0 +9,21.0,0 +10,15.0,0 +11,10.0,0 +12,14.0,0 +13,37.0,0 +14,12.0,0 +15,10.0,0 +16,27.0,0 +17,33.0,0 +18,19.0,0 +19,13.0,0 +20,26.0,0 +21,15.0,0 +22,29.0,0 +23,11.0,0 +24,20.0,0 +25,23.0,0 +26,23.0,0 +27,26.0,0 +28,17.0,0 +29,33.0,0 +30,16.0,0 +31,48.0,0 +32,48.0,0 +33,69.0,0 +34,58.0,0 +35,24.0,0 +36,18.0,0 +37,28.0,0 +38,12.0,0 +39,12.0,0 +40,18.0,0 +41,12.0,0 +42,13.0,0 +43,21.0,0 +44,30.0,0 +45,32.0,0 +46,22.0,0 +47,18.0,0 +48,12.0,0 +49,12.0,0 +50,20.0,0 +51,32.0,0 +52,15.0,0 +53,100.0,0 +54,26.0,0 +55,25.0,0 +56,18.0,0 +57,15.0,0 +58,35.0,0 +59,12.0,0 +60,65.0,0 +61,27.0,0 +62,29.0,0 +63,22.0,0 +64,83.0,0 +65,24.0,0 +66,28.0,0 +67,15.0,0 +68,43.0,0 +69,13.0,0 +70,22.0,0 +71,46.0,0 +72,14.0,0 +73,32.0,0 +74,44.0,0 +75,53.0,0 +76,31.0,0 +77,51.0,0 +78,61.0,0 +79,30.0,0 +80,36.0,0 +81,30.0,0 +82,48.0,0 +83,26.0,0 +84,27.0,0 
+85,43.0,0 +86,20.0,0 +87,87.0,0 +88,71.0,0 +89,43.0,0 +90,57.0,0 +91,40.0,0 +92,37.0,0 +93,43.0,0 +94,31.0,0 +95,45.0,0 +96,47.0,0 +97,52.0,0 +98,48.0,0 +99,98.0,0 +100,49.0,0 +101,98.0,0 +102,68.0,0 +103,70.0,0 +104,74.0,0 +105,73.0,0 +106,127.0,0 +107,92.0,0 +108,70.0,0 +109,97.0,0 +110,66.0,0 +111,112.0,0 +112,138.0,0 +113,81.0,0 +114,74.0,0 +115,153.0,0 +116,113.0,0 +117,88.0,0 +118,138.0,0 +119,200.0,0 +120,84.0,0 +121,123.0,0 +122,158.0,0 +123,171.0,0 +124,137.0,0 +125,143.0,0 +126,170.0,0 +127,127.0,0 +128,118.0,0 +129,200.0,0 +130,189.0,0 +131,149.0,0 +132,137.0,0 +133,115.0,0 +134,153.0,0 +135,136.0,0 +136,140.0,0 +137,169.0,0 +138,187.0,0 +139,200.0,0 +140,196.0,0 +141,200.0,0 +142,200.0,0 +143,137.0,0 +144,200.0,0 +145,185.0,0 +146,200.0,0 +147,164.0,0 +148,200.0,0 +149,143.0,0 +150,143.0,0 +151,112.0,0 +152,192.0,0 +153,200.0,0 +154,144.0,0 +155,188.0,0 +156,200.0,0 +157,133.0,0 +158,200.0,0 +159,143.0,0 +160,158.0,0 +161,161.0,0 +162,169.0,0 +163,176.0,0 +164,200.0,0 +165,149.0,0 +166,156.0,0 +167,200.0,0 +168,200.0,0 +169,200.0,0 +170,134.0,0 +171,171.0,0 +172,200.0,0 +173,200.0,0 +174,200.0,0 +175,194.0,0 +176,200.0,0 +177,138.0,0 +178,159.0,0 +179,187.0,0 +180,200.0,0 +181,192.0,0 +182,200.0,0 +183,200.0,0 +184,200.0,0 +185,173.0,0 +186,200.0,0 +187,178.0,0 +188,176.0,0 +189,196.0,0 +190,200.0,0 +191,195.0,0 +192,158.0,0 +193,156.0,0 +194,200.0,0 +195,200.0,0 +196,200.0,0 +197,200.0,0 +198,193.0,0 +199,200.0,0 diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/models/checkpoint.pth b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/models/checkpoint.pth new file mode 100644 index 0000000..01e8c46 Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/models/checkpoint.pth differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/params.json b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/params.json new file mode 100644 index 0000000..2d2c2ca --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/params.json @@ -0,0 +1 @@ +{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2} \ No newline at end of file diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_curve.png new file mode 100644 index 0000000..288ee92 Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_curve.png differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_results.csv b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_results.csv new file mode 100644 index 0000000..6e8adb7 --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/testing_results.csv @@ -0,0 +1,21 @@ +episodes,rewards,steps +0,200.0,0 +1,200.0,0 
+2,200.0,0 +3,200.0,0 +4,191.0,0 +5,200.0,0 +6,200.0,0 +7,179.0,0 +8,200.0,0 +9,200.0,0 +10,200.0,0 +11,190.0,0 +12,147.0,0 +13,197.0,0 +14,200.0,0 +15,200.0,0 +16,167.0,0 +17,200.0,0 +18,200.0,0 +19,200.0,0 diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_curve.png b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_curve.png new file mode 100644 index 0000000..544de6e Binary files /dev/null and b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_curve.png differ diff --git a/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_results.csv b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_results.csv new file mode 100644 index 0000000..67bdb9e --- /dev/null +++ b/projects/codes/DoubleDQN/outputs/CartPole-v0/20220829-233635/results/training_results.csv @@ -0,0 +1,201 @@ +episodes,rewards,steps +0,19.0,0 +1,16.0,0 +2,17.0,0 +3,11.0,0 +4,10.0,0 +5,27.0,0 +6,55.0,0 +7,17.0,0 +8,23.0,0 +9,9.0,0 +10,17.0,0 +11,14.0,0 +12,17.0,0 +13,12.0,0 +14,14.0,0 +15,16.0,0 +16,27.0,0 +17,36.0,0 +18,17.0,0 +19,17.0,0 +20,21.0,0 +21,23.0,0 +22,13.0,0 +23,12.0,0 +24,17.0,0 +25,26.0,0 +26,25.0,0 +27,17.0,0 +28,10.0,0 +29,16.0,0 +30,14.0,0 +31,19.0,0 +32,23.0,0 +33,37.0,0 +34,29.0,0 +35,22.0,0 +36,29.0,0 +37,15.0,0 +38,16.0,0 +39,18.0,0 +40,23.0,0 +41,16.0,0 +42,26.0,0 +43,13.0,0 +44,24.0,0 +45,39.0,0 +46,23.0,0 +47,32.0,0 +48,123.0,0 +49,18.0,0 +50,39.0,0 +51,17.0,0 +52,28.0,0 +53,34.0,0 +54,26.0,0 +55,61.0,0 +56,28.0,0 +57,16.0,0 +58,45.0,0 +59,41.0,0 +60,49.0,0 +61,18.0,0 +62,40.0,0 +63,24.0,0 +64,37.0,0 +65,26.0,0 +66,51.0,0 +67,17.0,0 +68,152.0,0 +69,17.0,0 +70,29.0,0 +71,37.0,0 +72,15.0,0 +73,55.0,0 +74,152.0,0 +75,23.0,0 +76,45.0,0 +77,30.0,0 +78,39.0,0 +79,20.0,0 +80,53.0,0 +81,49.0,0 +82,71.0,0 +83,115.0,0 +84,41.0,0 +85,52.0,0 +86,52.0,0 +87,36.0,0 +88,84.0,0 +89,122.0,0 +90,49.0,0 +91,200.0,0 +92,67.0,0 +93,87.0,0 +94,183.0,0 +95,132.0,0 +96,76.0,0 +97,200.0,0 +98,200.0,0 +99,200.0,0 +100,200.0,0 +101,200.0,0 +102,106.0,0 +103,192.0,0 +104,111.0,0 +105,95.0,0 +106,200.0,0 +107,200.0,0 +108,148.0,0 +109,200.0,0 +110,97.0,0 +111,200.0,0 +112,200.0,0 +113,105.0,0 +114,135.0,0 +115,200.0,0 +116,144.0,0 +117,156.0,0 +118,200.0,0 +119,200.0,0 +120,166.0,0 +121,200.0,0 +122,200.0,0 +123,200.0,0 +124,200.0,0 +125,200.0,0 +126,200.0,0 +127,158.0,0 +128,139.0,0 +129,200.0,0 +130,200.0,0 +131,200.0,0 +132,200.0,0 +133,122.0,0 +134,200.0,0 +135,188.0,0 +136,200.0,0 +137,183.0,0 +138,200.0,0 +139,200.0,0 +140,200.0,0 +141,200.0,0 +142,200.0,0 +143,158.0,0 +144,200.0,0 +145,200.0,0 +146,200.0,0 +147,191.0,0 +148,200.0,0 +149,194.0,0 +150,178.0,0 +151,200.0,0 +152,200.0,0 +153,200.0,0 +154,162.0,0 +155,200.0,0 +156,200.0,0 +157,128.0,0 +158,200.0,0 +159,184.0,0 +160,194.0,0 +161,200.0,0 +162,200.0,0 +163,200.0,0 +164,200.0,0 +165,160.0,0 +166,163.0,0 +167,200.0,0 +168,200.0,0 +169,200.0,0 +170,141.0,0 +171,200.0,0 +172,200.0,0 +173,200.0,0 +174,200.0,0 +175,200.0,0 +176,200.0,0 +177,157.0,0 +178,164.0,0 +179,200.0,0 +180,200.0,0 +181,200.0,0 +182,200.0,0 +183,200.0,0 +184,200.0,0 +185,193.0,0 +186,182.0,0 +187,200.0,0 +188,200.0,0 +189,200.0,0 +190,200.0,0 +191,200.0,0 +192,174.0,0 +193,178.0,0 +194,200.0,0 +195,200.0,0 +196,200.0,0 +197,200.0,0 +198,200.0,0 +199,200.0,0 diff --git a/projects/codes/DoubleDQN/task0.py b/projects/codes/DoubleDQN/task0.py deleted file mode 100644 index 7451d24..0000000 --- a/projects/codes/DoubleDQN/task0.py +++ /dev/null @@ -1,125 
+0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-11-07 18:10:37 -LastEditor: JiangJi -LastEditTime: 2022-07-21 21:52:31 -Discription: -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # current path -parent_path = os.path.dirname(curr_path) # parent path -sys.path.append(parent_path) # add to system path - -import gym -import torch -import datetime -import argparse - -from common.utils import save_results,make_dir -from common.utils import plot_rewards,save_args -from common.models import MLP -from common.memories import ReplayBuffer -from DoubleDQN.double_dqn import DoubleDQN - -def get_args(): - """ 超参数 - """ - curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 - parser = argparse.ArgumentParser(description="hyperparameters") - parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm") - parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") - parser.add_argument('--train_eps',default=200,type=int,help="episodes of training") - parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") - parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor") - parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") - parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") - parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon") - parser.add_argument('--lr',default=0.0001,type=float,help="learning rate") - parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity") - parser.add_argument('--batch_size',default=64,type=int) - parser.add_argument('--target_update',default=4,type=int) - parser.add_argument('--hidden_dim',default=256,type=int) - parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda") - parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ - '/' + curr_time + '/results/' ) - parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ - '/' + curr_time + '/models/' ) # 保存模型的路径 - parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") - args = parser.parse_args() - return args - - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env_name) - env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim) - memory = ReplayBuffer(cfg.memory_capacity) - agent = DoubleDQN(n_states,n_actions,model,memory,cfg) - return env,agent - -def train(cfg,env,agent): - print("开始训练!") - print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}") - rewards = [] # 记录所有回合的奖励 - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.sample(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() - if done: - break - if i_ep % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - if (i_ep+1)%10 == 0: - print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f},Epislon:{agent.epsilon:.3f}') - rewards.append(ep_reward) - print("完成训练!") - return {'rewards':rewards} - -def test(cfg,env,agent): - print("开始测试!") - 
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}") - rewards = [] # 记录所有回合的奖励 - for i_ep in range(cfg.test_eps): - state = env.reset() - ep_reward = 0 - while True: - action = agent.predict(state) - next_state, reward, done, _ = env.step(action) - state = next_state - ep_reward += reward - if done: - break - rewards.append(ep_reward) - print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}') - print("完成测试!") - return {'rewards':rewards} - -if __name__ == "__main__": - cfg = get_args() - # 训练 - env, agent = env_agent_config(cfg,seed=1) - res_dic = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - save_args(cfg) # 保存参数 - agent.save(path=cfg.model_path) # 保存模型 - save_results(res_dic, tag='train', - path=cfg.result_path) - plot_rewards(res_dic['rewards'], cfg, tag="train") - # 测试 - env, agent = env_agent_config(cfg,seed=1) - agent.load(path=cfg.model_path) # 导入模型 - res_dic = test(cfg, env, agent) - save_results(res_dic, tag='test', - path=cfg.result_path) # 保存结果 - plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果 diff --git a/projects/codes/scripts/DoubleDQN_CartPole-v0.sh b/projects/codes/scripts/DoubleDQN_CartPole-v0.sh new file mode 100644 index 0000000..0da88f2 --- /dev/null +++ b/projects/codes/scripts/DoubleDQN_CartPole-v0.sh @@ -0,0 +1,15 @@ +# run Double DQN on CartPole-v0 +# source conda, if you are already in proper conda environment, then comment the codes util "conda activate easyrl" + +if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then + echo "source file at ~/anaconda3/etc/profile.d/conda.sh" + source ~/anaconda3/etc/profile.d/conda.sh +elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then + echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh" + source ~/opt/anaconda3/etc/profile.d/conda.sh +else + echo 'please manually config the conda source path' +fi +conda activate easyrl # easyrl here can be changed to another name of conda env that you have created +codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path +python $codes_dir/DoubleDQN/main.py --device cuda \ No newline at end of file