diff --git a/.gitignore b/.gitignore index cb8735f..a1c0613 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_STORE __pycache__ -.vscode \ No newline at end of file +.vscode +test.py \ No newline at end of file diff --git a/codes/A2C/test.py b/codes/A2C/test.py deleted file mode 100644 index 36aef44..0000000 --- a/codes/A2C/test.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2021-03-20 17:43:17 -LastEditor: John -LastEditTime: 2021-04-05 11:19:20 -Discription: -Environment: -''' -import sys -import torch -import gym -import numpy as np -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -from torch.autograd import Variable -import matplotlib.pyplot as plt -import pandas as pd - - -learning_rate = 3e-4 - -# Constants -GAMMA = 0.99 - -class A2CConfig: - ''' hyperparameters - ''' - def __init__(self): - self.gamma = 0.99 - self.lr = 3e-4 # learnning rate - self.actor_lr = 1e-4 # learnning rate of actor network - self.memory_capacity = 10000 # capacity of replay memory - self.batch_size = 128 - self.train_eps = 3000 - self.train_steps = 200 - self.eval_eps = 200 - self.eval_steps = 200 - self.target_update = 4 - self.hidden_dim = 256 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, learning_rate=3e-4): - super(ActorCritic, self).__init__() - - self.n_actions = n_actions - self.critic_linear1 = nn.Linear(n_states, hidden_dim) - self.critic_linear2 = nn.Linear(hidden_dim, 1) - - self.actor_linear1 = nn.Linear(n_states, hidden_dim) - self.actor_linear2 = nn.Linear(hidden_dim, n_actions) - - def forward(self, state): - state = Variable(torch.from_numpy(state).float().unsqueeze(0)) - value = F.relu(self.critic_linear1(state)) - value = self.critic_linear2(value) - policy_dist = F.relu(self.actor_linear1(state)) - policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1) - - return value, policy_dist - -class A2C: - def __init__(self,n_states,n_actions,cfg): - self.model = ActorCritic(n_states, n_actions, cfg.hidden_dim) - self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr) - def choose_action(self,state): - pass - def update(self): - pass - -def train(cfg,env,agent): - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - actor_critic = ActorCritic(n_states, n_actions, cfg.hidden_dim) - ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate) - - all_lengths = [] - average_lengths = [] - all_rewards = [] - entropy_term = 0 - - for episode in range(cfg.train_eps): - log_probs = [] - values = [] - rewards = [] - state = env.reset() - for steps in range(cfg.train_steps): - value, policy_dist = actor_critic.forward(state) - value = value.detach().numpy()[0,0] - dist = policy_dist.detach().numpy() - - action = np.random.choice(n_actions, p=np.squeeze(dist)) - log_prob = torch.log(policy_dist.squeeze(0)[action]) - entropy = -np.sum(np.mean(dist) * np.log(dist)) - new_state, reward, done, _ = env.step(action) - - rewards.append(reward) - values.append(value) - log_probs.append(log_prob) - entropy_term += entropy - state = new_state - - if done or steps == cfg.train_steps-1: - Qval, _ = actor_critic.forward(new_state) - Qval = Qval.detach().numpy()[0,0] - all_rewards.append(np.sum(rewards)) - all_lengths.append(steps) - average_lengths.append(np.mean(all_lengths[-10:])) - if episode % 10 == 0: - 
sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps+1, average_lengths[-1])) - break - - # compute Q values - Qvals = np.zeros_like(values) - for t in reversed(range(len(rewards))): - Qval = rewards[t] + GAMMA * Qval - Qvals[t] = Qval - - #update actor critic - values = torch.FloatTensor(values) - Qvals = torch.FloatTensor(Qvals) - log_probs = torch.stack(log_probs) - - advantage = Qvals - values - actor_loss = (-log_probs * advantage).mean() - critic_loss = 0.5 * advantage.pow(2).mean() - ac_loss = actor_loss + critic_loss + 0.001 * entropy_term - - ac_optimizer.zero_grad() - ac_loss.backward() - ac_optimizer.step() - - - - # Plot results - smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean() - smoothed_rewards = [elem for elem in smoothed_rewards] - plt.plot(all_rewards) - plt.plot(smoothed_rewards) - plt.plot() - plt.xlabel('Episode') - plt.ylabel('Reward') - plt.show() - - plt.plot(all_lengths) - plt.plot(average_lengths) - plt.xlabel('Episode') - plt.ylabel('Episode length') - plt.show() - -if __name__ == "__main__": - cfg = A2CConfig() - env = gym.make("CartPole-v0") - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = A2C(n_states,n_actions,cfg) - train(cfg,env,agent) \ No newline at end of file diff --git a/codes/DDPG/agent.py b/codes/DDPG/agent.py index b080c15..0a8fd30 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-09 20:25:52 @LastEditor: John -LastEditTime: 2021-03-31 00:56:32 +LastEditTime: 2021-05-04 14:50:17 @Discription: @Environment: python 3.7.7 ''' @@ -26,6 +26,7 @@ class DDPG: self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + # copy parameters to target net for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()): @@ -42,7 +43,6 @@ class DDPG: def choose_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) action = self.actor(state) - # torch.detach()用于切断反向传播 return action.detach().cpu().numpy()[0, 0] def update(self): @@ -50,13 +50,13 @@ class DDPG: return state, action, reward, next_state, done = self.memory.sample( self.batch_size) - # 将所有变量转为张量 + # convert variables to Tensor state = torch.FloatTensor(state).to(self.device) next_state = torch.FloatTensor(next_state).to(self.device) action = torch.FloatTensor(action).to(self.device) reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device) done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device) - # 注意critic将(s_t,a)作为输入 + policy_loss = self.critic(state, self.actor(state)) policy_loss = -policy_loss.mean() next_action = self.target_actor(next_state) diff --git a/codes/DDPG/main.py b/codes/DDPG/main.py deleted file mode 100644 index 736178b..0000000 --- a/codes/DDPG/main.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 20:58:21 -@LastEditor: John -LastEditTime: 2021-04-29 01:58:50 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -from pathlib import Path -import sys,os -curr_path = os.path.dirname(__file__) -parent_path=os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal 
path to sys.path - -import torch -import gym -import numpy as np -import datetime -from DDPG.agent import DDPG -from DDPG.env import NormalizedActions,OUNoise -from common.plot import plot_rewards -from common.utils import save_results - -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time -SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model -if not os.path.exists(curr_path+"/saved_model/"): os.mkdir(curr_path+"/saved_model/") -if not os.path.exists(SAVED_MODEL_PATH): os.mkdir(SAVED_MODEL_PATH) -RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards -if not os.path.exists(curr_path+"/results/"): os.mkdir(curr_path+"/results/") -if not os.path.exists(RESULT_PATH): os.mkdir(RESULT_PATH) - -class DDPGConfig: - def __init__(self): - self.env = 'Pendulum-v0' - self.algo = 'DDPG' - self.gamma = 0.99 - self.critic_lr = 1e-3 - self.actor_lr = 1e-4 - self.memory_capacity = 10000 - self.batch_size = 128 - self.train_eps =300 - self.eval_eps = 200 - self.eval_steps = 200 - self.target_update = 4 - self.hidden_dim = 30 - self.soft_tau=1e-2 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -def train(cfg,env,agent): - print('Start to train ! ') - ou_noise = OUNoise(env.action_space) # action noise - rewards = [] - ma_rewards = [] # moving average rewards - ep_steps = [] - for i_episode in range(cfg.train_eps): - state = env.reset() - ou_noise.reset() - done = False - ep_reward = 0 - i_step = 0 - while not done: - i_step += 1 - action = agent.choose_action(state) - action = ou_noise.get_action(action, i_step) # 即paper中的random process - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - agent.update() - state = next_state - print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward)) - ep_steps.append(i_step) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DDPGConfig() - env = NormalizedActions(gym.make("Pendulum-v0")) - env.seed(1) # 设置env随机种子 - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.shape[0] - agent = DDPG(state_dim,action_dim,cfg) - rewards,ma_rewards = train(cfg,env,agent) - agent.save(path=SAVED_MODEL_PATH) - save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) - \ No newline at end of file diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/models/checkpoint.pt b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/models/checkpoint.pt new file mode 100644 index 0000000..be79646 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/models/checkpoint.pt differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_ma_rewards.npy new file mode 100644 index 0000000..7062ae6 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards.npy new file mode 100644 index 0000000..f5156f8 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards.npy 
differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards_curve.png b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards_curve.png new file mode 100644 index 0000000..53589b0 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/eval_rewards_curve.png differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_ma_rewards.npy new file mode 100644 index 0000000..e2d734b Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards.npy new file mode 100644 index 0000000..092936c Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards_curve.png b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards_curve.png new file mode 100644 index 0000000..60e508a Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210504-024530/results/train_rewards_curve.png differ diff --git a/codes/DDPG/results/20210331-010047/ma_rewards_train.npy b/codes/DDPG/results/20210331-010047/ma_rewards_train.npy deleted file mode 100644 index 6d3572e..0000000 Binary files a/codes/DDPG/results/20210331-010047/ma_rewards_train.npy and /dev/null differ diff --git a/codes/DDPG/results/20210331-010047/rewards_curve_train.png b/codes/DDPG/results/20210331-010047/rewards_curve_train.png deleted file mode 100644 index f2046a5..0000000 Binary files a/codes/DDPG/results/20210331-010047/rewards_curve_train.png and /dev/null differ diff --git a/codes/DDPG/results/20210331-010047/rewards_train.npy b/codes/DDPG/results/20210331-010047/rewards_train.npy deleted file mode 100644 index 72a95cc..0000000 Binary files a/codes/DDPG/results/20210331-010047/rewards_train.npy and /dev/null differ diff --git a/codes/DDPG/saved_model/20210331-010047/checkpoint.pt b/codes/DDPG/saved_model/20210331-010047/checkpoint.pt deleted file mode 100644 index 85ddc28..0000000 Binary files a/codes/DDPG/saved_model/20210331-010047/checkpoint.pt and /dev/null differ diff --git a/codes/DDPG/task0_train.py b/codes/DDPG/task0_train.py new file mode 100644 index 0000000..50e2723 --- /dev/null +++ b/codes/DDPG/task0_train.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +@Author: John +@Email: johnjim0816@gmail.com +@Date: 2020-06-11 20:58:21 +@LastEditor: John +LastEditTime: 2021-05-04 14:49:45 +@Discription: +@Environment: python 3.7.7 +''' +import sys,os +curr_path = os.path.dirname(__file__) +parent_path = os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import datetime +import gym +import torch + +from DDPG.env import NormalizedActions, OUNoise +from DDPG.agent import DDPG +from common.utils import save_results,make_dir +from common.plot import plot_rewards + +curr_time = datetime.datetime.now().strftime( + "%Y%m%d-%H%M%S") # obtain current time + + +class DDPGConfig: + def __init__(self): + self.algo = 'DDPG' + self.env = 'Pendulum-v0' # env name + self.result_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/models/' # path to save results + 
self.gamma = 0.99 + self.critic_lr = 1e-3 + self.actor_lr = 1e-4 + self.memory_capacity = 10000 + self.batch_size = 128 + self.train_eps = 300 + self.eval_eps = 50 + self.eval_steps = 200 + self.target_update = 4 + self.hidden_dim = 30 + self.soft_tau = 1e-2 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") + +def env_agent_config(cfg,seed=1): + env = NormalizedActions(gym.make(cfg.env)) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = DDPG(state_dim,action_dim,cfg) + return env,agent + +def train(cfg, env, agent): + print('Start to train!') + print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') + ou_noise = OUNoise(env.action_space) # action noise + rewards = [] + ma_rewards = [] # moving average rewards + for i_episode in range(cfg.train_eps): + state = env.reset() + ou_noise.reset() + done = False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + action = ou_noise.get_action( + action, i_step) # add exploration noise, i.e. the random process in the DDPG paper + next_state, reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) + agent.update() + state = next_state + print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('Complete training!') + return rewards, ma_rewards + +def eval(cfg, env, agent): + print('Start to eval!') + print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') + rewards = [] + ma_rewards = [] # moving average rewards + for i_episode in range(cfg.eval_eps): + state = env.reset() + done = False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + state = next_state + print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.eval_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('Complete eval!') + return rewards, ma_rewards + + +if __name__ == "__main__": + cfg = DDPGConfig() + + # train + env,agent = env_agent_config(cfg,seed=1) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(cfg.result_path, cfg.model_path) + agent.save(path=cfg.model_path) + save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) + plot_rewards(rewards, ma_rewards, tag="train", + algo=cfg.algo, path=cfg.result_path) + + # eval + env,agent = env_agent_config(cfg,seed=10) + agent.load(path=cfg.model_path) + rewards,ma_rewards = eval(cfg,env,agent) + save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + diff --git a/codes/DQN/task0_train.py b/codes/DQN/task0_train.py index fc13983..75e1d91 100644 --- a/codes/DQN/task0_train.py +++ b/codes/DQN/task0_train.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-04-29 22:23:38 +LastEditTime: 2021-05-04 15:01:34 @Discription: @Environment: python 3.7.7 ''' @@ -18,16 +18,13 @@ import datetime import torch import gym -from common.utils import save_results, make_dir, del_empty_dir +from common.utils import save_results, make_dir from common.plot import plot_rewards from DQN.agent
import DQN - - curr_time = datetime.datetime.now().strftime( "%Y%m%d-%H%M%S") # obtain current time - class DQNConfig: def __init__(self): self.algo = "DQN" # name of algo @@ -80,7 +77,7 @@ def train(cfg, env, agent): agent.target_net.load_state_dict(agent.policy_net.state_dict()) print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) - # 计算滑动窗口的reward + # save ma rewards if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: diff --git a/codes/DQN_cnn/README.md b/codes/DQN_cnn/README.md deleted file mode 100644 index 4d1be2a..0000000 --- a/codes/DQN_cnn/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# DQN with cnn -原理与[DQN](../DQN)相同,只是将神经网络换成卷积神经网络,用于二维观测信息(state或obervation) \ No newline at end of file diff --git a/codes/DQN_cnn/agent.py b/codes/DQN_cnn/agent.py deleted file mode 100644 index de2021c..0000000 --- a/codes/DQN_cnn/agent.py +++ /dev/null @@ -1,107 +0,0 @@ -import random -import math -import torch -import torch.optim as optim -import torch.nn.functional as F -from DQN_cnn.memory import ReplayBuffer -from DQN_cnn.model import CNN - - -class DQNcnn: - def __init__(self, screen_height,screen_width, action_dim, cfg): - - self.device = cfg.device - self.action_dim = action_dim - self.gamma = cfg.gamma - # e-greedy策略相关参数 - self.actions_count = 0 - self.epsilon = 0 - self.epsilon_start = cfg.epsilon_start - self.epsilon_end = cfg.epsilon_end - self.epsilon_decay = cfg.epsilon_decay - self.batch_size = cfg.batch_size - self.policy_net = CNN(screen_height, screen_width, - action_dim).to(self.device) - self.target_net = CNN(screen_height, screen_width, - action_dim).to(self.device) - self.target_net.load_state_dict(self.policy_net.state_dict()) # target_net的初始模型参数完全复制policy_net - self.target_net.eval() # 不启用 BatchNormalization 和 Dropout - self.optimizer = optim.RMSprop(self.policy_net.parameters(),lr = cfg.lr) # 可查parameters()与state_dict()的区别,前者require_grad=True - self.loss = 0 - self.memory = ReplayBuffer(cfg.memory_capacity) - - - def choose_action(self, state): - '''选择动作 - Args: - state [array]: [description] - Returns: - action [array]: [description] - ''' - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. * self.actions_count / self.epsilon_decay) - self.actions_count += 1 - if random.random() > self.epsilon: - with torch.no_grad(): - q_value = self.policy_net(state) # q_value比如tensor([[-0.2522, 0.3887]]) - # tensor.max(1)返回每行的最大值以及对应的下标, - # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) - # 所以tensor.max(1)[1]返回最大值对应的下标,即action - action = q_value.max(1)[1].view(1, 1) # 注意这里action是个张量,如tensor([1]) - return action - else: - return torch.tensor([[random.randrange(self.action_dim)]], device=self.device, dtype=torch.long) - - def update(self): - if len(self.memory) < self.batch_size: - return - transitions = self.memory.sample(self.batch_size) - # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for - # detailed explanation). This converts batch-array of Transitions - # to Transition of batch-arrays. 
- batch = self.memory.Transition(*zip(*transitions)) - - # Compute a mask of non-final states and concatenate the batch elements - # (a final state would've been the one after which simulation ended) - non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, - batch.state_)), device=self.device, dtype=torch.bool) - - non_final_state_s = torch.cat([s for s in batch.state_ - if s is not None]) - state_batch = torch.cat(batch.state) - action_batch = torch.cat(batch.action) - reward_batch = torch.cat(batch.reward) # tensor([1., 1.,...,]) - - - # Compute Q(s_t, a) - the model computes Q(s_t), then we select the - # columns of actions taken. These are the actions which would've been taken - # for each batch state according to policy_net - state_action_values = self.policy_net( - state_batch).gather(1, action_batch) #tensor([[ 1.1217],...,[ 0.8314]]) - - # Compute V(s_{t+1}) for all next states. - # Expected values of actions for non_final_state_s are computed based - # on the "older" target_net; selecting their best reward with max(1)[0]. - # This is merged based on the mask, such that we'll have either the expected - # state value or 0 in case the state was final. - state__values = torch.zeros(self.batch_size, device=self.device) - - state__values[non_final_mask] = self.target_net( - non_final_state_s).max(1)[0].detach() - - # Compute the expected Q values - expected_state_action_values = (state__values * self.gamma) + reward_batch # tensor([0.9685, 0.9683,...,]) - - # Compute Huber loss - self.loss = F.smooth_l1_loss( - state_action_values, expected_state_action_values.unsqueeze(1)) # .unsqueeze增加一个维度 - # Optimize the model - self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls). - self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation. - for param in self.policy_net.parameters(): # clip防止梯度爆炸 - param.grad.data.clamp_(-1, 1) - self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters. - - -if __name__ == "__main__": - dqn = DQN() diff --git a/codes/DQN_cnn/env.py b/codes/DQN_cnn/env.py deleted file mode 100644 index 402eead..0000000 --- a/codes/DQN_cnn/env.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 10:02:35 -@LastEditor: John -@LastEditTime: 2020-06-11 16:57:34 -@Discription: -@Environment: python 3.7.7 -''' - -import numpy as np -import torch -import torchvision.transforms as T -from PIL import Image - -resize = T.Compose([T.ToPILImage(), - T.Resize(40, interpolation=Image.CUBIC), - T.ToTensor()]) - - -def get_cart_location(env,screen_width): - world_width = env.x_threshold * 2 - scale = screen_width / world_width - return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART - -def get_screen(env,device): - # Returned screen requested by gym is 400x600x3, but is sometimes larger - # such as 800x1200x3. Transpose it into torch order (CHW). 
- screen = env.render(mode='rgb_array').transpose((2, 0, 1)) - # Cart is in the lower half, so strip off the top and bottom of the screen - _, screen_height, screen_width = screen.shape - screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] - view_width = int(screen_width * 0.6) - cart_location = get_cart_location(env,screen_width) - if cart_location < view_width // 2: - slice_range = slice(view_width) - elif cart_location > (screen_width - view_width // 2): - slice_range = slice(-view_width, None) - else: - slice_range = slice(cart_location - view_width // 2, - cart_location + view_width // 2) - # Strip off the edges, so that we have a square image centered on a cart - screen = screen[:, :, slice_range] - # Convert to float, rescale, convert to torch tensor - # (this doesn't require a copy) - screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 - screen = torch.from_numpy(screen) - # Resize, and add a batch dimension (BCHW) - return resize(screen).unsqueeze(0).to(device) - -if __name__ == "__main__": - - import gym - env = gym.make('CartPole-v0').unwrapped - # if gpu is to be used - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - env.reset() - import matplotlib.pyplot as plt - - plt.figure() - plt.imshow(get_screen(env,device).cpu().squeeze(0).permute(1, 2, 0).numpy(), - interpolation='none') - plt.title('Example extracted screen') - plt.show() \ No newline at end of file diff --git a/codes/DQN_cnn/main.py b/codes/DQN_cnn/main.py deleted file mode 100644 index 89f9d77..0000000 --- a/codes/DQN_cnn/main.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 10:01:09 -@LastEditor: John -LastEditTime: 2021-04-05 11:06:23 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(__file__) -parent_path=os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import gym -import torch -import datetime -from DQN_cnn.env import get_screen -from DQN_cnn.agent import DQNcnn -from common.plot import plot_rewards -from common.utils import save_results - -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time -SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model -if not os.path.exists(curr_path+"/saved_model/"): - os.mkdir(curr_path+"/saved_model/") -if not os.path.exists(SAVED_MODEL_PATH): - os.mkdir(SAVED_MODEL_PATH) -RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards -if not os.path.exists(curr_path+"/results/"): - os.mkdir(curr_path+"/results/") -if not os.path.exists(RESULT_PATH): - os.mkdir(RESULT_PATH) - -class DQNcnnConfig: - def __init__(self) -> None: - self.algo = "DQN_cnn" # name of algo - self.gamma = 0.99 - self.epsilon_start = 0.95 # e-greedy策略的初始epsilon - self.epsilon_end = 0.05 - self.epsilon_decay = 200 - self.lr = 0.01 # leanring rate - self.memory_capacity = 10000 # Replay Memory容量 - self.batch_size = 64 - self.train_eps = 250 # 训练的episode数目 - self.train_steps = 200 # 训练每个episode的最大长度 - self.target_update = 4 # target net的更新频率 - self.eval_eps = 20 # 测试的episode数目 - self.eval_steps = 200 # 测试每个episode的最大长度 - self.hidden_dim = 128 # 神经网络隐藏层维度 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # if gpu is to be used - -def train(cfg, env, agent): - rewards = [] - ma_rewards = [] - for i_episode in range(cfg.train_eps): - # Initialize the environment and state - env.reset() - 
last_screen = get_screen(env, cfg.device) - current_screen = get_screen(env, cfg.device) - state = current_screen - last_screen - ep_reward = 0 - for i_step in range(cfg.train_steps+1): - # Select and perform an action - action = agent.choose_action(state) - _, reward, done, _ = env.step(action.item()) - ep_reward += reward - reward = torch.tensor([reward], device=cfg.device) - # Observe new state - last_screen = current_screen - current_screen = get_screen(env, cfg.device) - if done: - break - state_ = current_screen - last_screen - # Store the transition in memory - agent.memory.push(state, action, state_, reward) - # Move to the next state - state = state_ - # Perform one step of the optimization (on the target network) - agent.update() - # Update the target network, copying all weights and biases in DQN - if i_episode % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:{}/{}, Reward:{}, Steps:{}, Explore:{:.2f}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,agent.epsilon,done)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - return rewards,ma_rewards - - -if __name__ == "__main__": - cfg = DQNcnnConfig() - # Get screen size so that we can initialize layers correctly based on shape - # returned from AI gym. Typical dimensions at this point are close to 3x40x90 - # which is the result of a clamped and down-scaled render buffer in get_screen(env,device) - # 因为这里环境的state需要从默认的向量改为图像,所以要unwrapped更改state - env = gym.make('CartPole-v0').unwrapped - env.reset() - init_screen = get_screen(env, cfg.device) - _, _, screen_height, screen_width = init_screen.shape - # Get number of actions from gym action space - action_dim = env.action_space.n - agent = DQNcnn(screen_height, screen_width, - action_dim, cfg) - rewards,ma_rewards = train(cfg,env,agent) - save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) diff --git a/codes/DQN_cnn/memory.py b/codes/DQN_cnn/memory.py deleted file mode 100644 index 7359a0c..0000000 --- a/codes/DQN_cnn/memory.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 09:42:44 -@LastEditor: John -LastEditTime: 2021-03-23 20:38:41 -@Discription: -@Environment: python 3.7.7 -''' -from collections import namedtuple -import random - -class ReplayBuffer(object): - - def __init__(self, capacity): - self.capacity = capacity - self.buffer = [] - self.position = 0 - self.Transition = namedtuple('Transition', - ('state', 'action', 'state_', 'reward')) - - def push(self, *args): - """Saves a transition.""" - if len(self.buffer) < self.capacity: - self.buffer.append(None) - self.buffer[self.position] = self.Transition(*args) - self.position = (self.position + 1) % self.capacity - - def sample(self, batch_size): - return random.sample(self.buffer, batch_size) - - def __len__(self): - return len(self.buffer) diff --git a/codes/DQN_cnn/model.py b/codes/DQN_cnn/model.py deleted file mode 100644 index 71e67ca..0000000 --- a/codes/DQN_cnn/model.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 12:18:12 -@LastEditor: John -@LastEditTime: 2020-06-11 17:23:45 -@Discription: -@Environment: python 3.7.7 -''' -import torch.nn as nn -import torch.nn.functional as F - 
-class CNN(nn.Module): - - def __init__(self, h, w, n_outputs): - super(CNN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2) - self.bn1 = nn.BatchNorm2d(16) - self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) - self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(32) - - # Number of Linear input connections depends on output of conv2d layers - # and therefore the input image size, so compute it. - def conv2d_size_out(size, kernel_size = 5, stride = 2): - return (size - (kernel_size - 1) - 1) // stride + 1 - convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) - convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 32 - self.head = nn.Linear(linear_input_size, n_outputs) - - # Called with either one element to determine next action, or a batch - # during optimization. Returns tensor([[left0exp,right0exp]...]). - def forward(self, x): - x = F.relu(self.bn1(self.conv1(x))) - x = F.relu(self.bn2(self.conv2(x))) - x = F.relu(self.bn3(self.conv3(x))) - return self.head(x.view(x.size(0), -1)) \ No newline at end of file diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index 34774c4..35c7c76 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-03-28 11:07:35 +LastEditTime: 2021-05-04 15:04:45 @Discription: @Environment: python 3.7.7 ''' @@ -42,15 +42,8 @@ class DoubleDQN: self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) self.loss = 0 self.memory = ReplayBuffer(cfg.memory_capacity) - - def choose_action(self, state): - '''选择动作 - ''' - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. * self.actions_count / self.epsilon_decay) - self.actions_count += 1 - if random.random() > self.epsilon: - with torch.no_grad(): + def predict(self,state): + with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor( @@ -61,6 +54,15 @@ class DoubleDQN: # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() + return action + def choose_action(self, state): + '''选择动作 + ''' + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ + math.exp(-1. 
* self.actions_count / self.epsilon_decay) + self.actions_count += 1 + if random.random() > self.epsilon: + action = self.predict(state) else: action = random.randrange(self.action_dim) return action @@ -113,7 +115,9 @@ class DoubleDQN: self.optimizer.step() # 更新模型 def save(self,path): - torch.save(self.target_net.state_dict(), path+'DoubleDQN_checkpoint.pth') + torch.save(self.target_net.state_dict(), path+'checkpoint.pth') def load(self,path): - self.target_net.load_state_dict(torch.load(path+'DoubleDQN_checkpoint.pth')) + self.target_net.load_state_dict(torch.load(path+'checkpoint.pth')) + for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): + param.data.copy_(target_param.data) diff --git a/codes/DoubleDQN/main.py b/codes/DoubleDQN/main.py deleted file mode 100644 index 57c9f9c..0000000 --- a/codes/DoubleDQN/main.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-12 00:48:57 -@LastEditor: John -LastEditTime: 2021-03-28 11:05:14 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -sys.path.append(os.getcwd()) # add current terminal path -import gym -import torch -import datetime -from DoubleDQN.agent import DoubleDQN -from common.plot import plot_rewards -from common.utils import save_results - -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 -if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): - os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") -if not os.path.exists(SAVED_MODEL_PATH): - os.mkdir(SAVED_MODEL_PATH) -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 -if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): - os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") -if not os.path.exists(RESULT_PATH): - os.mkdir(RESULT_PATH) - -class DoubleDQNConfig: - def __init__(self): - self.algo = "Double DQN" # name of algo - self.gamma = 0.99 - self.epsilon_start = 0.9 # e-greedy策略的初始epsilon - self.epsilon_end = 0.01 - self.epsilon_decay = 200 - self.lr = 0.01 # 学习率 - self.memory_capacity = 10000 # Replay Memory容量 - self.batch_size = 128 - self.train_eps = 300 # 训练的episode数目 - self.train_steps = 200 # 训练每个episode的最大长度 - self.target_update = 2 # target net的更新频率 - self.eval_eps = 20 # 测试的episode数目 - self.eval_steps = 200 # 测试每个episode的最大长度 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - self.hidden_dim = 128 # 神经网络隐藏层维度 - - -def train(cfg,env,agent): - print('Start to train !') - rewards,ma_rewards = [],[] - ep_steps = [] - for i_episode in range(cfg.train_eps): - state = env.reset() # reset环境状态 - ep_reward = 0 - for i_step in range(cfg.train_steps): - action = agent.choose_action(state) # 根据当前环境state选择action - next_state, reward, done, _ = env.step(action) # 更新环境参数 - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) # 将state等这些transition存入memory - state = next_state # 跳转到下一个状态 - agent.update() # 每步更新网络 - if done: - break - # 更新target network,复制DQN中的所有weights and biases - if i_episode % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step,done)) - ep_steps.append(i_step) - 
rewards.append(ep_reward) - # 计算滑动窗口的reward - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DoubleDQNConfig() - env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 - env.seed(1) # 设置env随机种子 - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = DoubleDQN(state_dim,action_dim,cfg) - rewards,ma_rewards = train(cfg,env,agent) - agent.save(path=SAVED_MODEL_PATH) - save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth new file mode 100644 index 0000000..8c4b561 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy new file mode 100644 index 0000000..0f77696 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy new file mode 100644 index 0000000..57f8759 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png new file mode 100644 index 0000000..038e031 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy new file mode 100644 index 0000000..63d10e7 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy new file mode 100644 index 0000000..d486ad9 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png new file mode 100644 index 0000000..f91bc4d Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png differ diff --git a/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy deleted file mode 100644 index 1c4be2b..0000000 Binary files a/codes/DoubleDQN/results/20210328-110516/ma_rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png b/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png deleted file mode 100644 index 2817223..0000000 Binary files 
a/codes/DoubleDQN/results/20210328-110516/rewards_curve_train.png and /dev/null differ diff --git a/codes/DoubleDQN/results/20210328-110516/rewards_train.npy b/codes/DoubleDQN/results/20210328-110516/rewards_train.npy deleted file mode 100644 index 73acfde..0000000 Binary files a/codes/DoubleDQN/results/20210328-110516/rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth b/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth deleted file mode 100644 index 69f5fce..0000000 Binary files a/codes/DoubleDQN/saved_model/20210328-110516/DoubleDQN_checkpoint.pth and /dev/null differ diff --git a/codes/DoubleDQN/task0_train.py b/codes/DoubleDQN/task0_train.py new file mode 100644 index 0000000..3819c04 --- /dev/null +++ b/codes/DoubleDQN/task0_train.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +@Author: John +@Email: johnjim0816@gmail.com +@Date: 2020-06-12 00:48:57 +@LastEditor: John +LastEditTime: 2021-05-04 15:05:37 +@Discription: +@Environment: python 3.7.7 +''' +import sys,os +curr_path = os.path.dirname(__file__) +parent_path = os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import gym +import torch +import datetime +from DoubleDQN.agent import DoubleDQN +from common.plot import plot_rewards +from common.utils import save_results, make_dir + +curr_time = datetime.datetime.now().strftime( + "%Y%m%d-%H%M%S") # obtain current time + +class DoubleDQNConfig: + def __init__(self): + self.algo = "DoubleDQN" # name of algo + self.env = 'CartPole-v0' # env name + self.result_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/outputs/" + self.env + \ + '/'+curr_time+'/models/' # path to save models + self.gamma = 0.99 + self.epsilon_start = 0.9 # start epsilon of e-greedy policy + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.01 # learning rate + self.memory_capacity = 10000 # capacity of Replay Memory + self.batch_size = 128 + self.train_eps = 300 # max training episodes + self.train_steps = 200 # max training steps per episode + self.target_update = 2 # update frequency of target net + self.eval_eps = 50 # max evaluation episodes + self.eval_steps = 200 # max evaluation steps per episode + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu + self.hidden_dim = 128 # hidden size of net + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DoubleDQN(state_dim,action_dim,cfg) + return env,agent + +def train(cfg,env,agent): + print('Start to train!') + rewards,ma_rewards = [],[] + for i_ep in range(cfg.train_eps): + state = env.reset() # reset the environment + ep_reward = 0 + while True: + action = agent.choose_action(state) # choose an action based on the current state + next_state, reward, done, _ = env.step(action) # step the environment + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) # store the transition in replay memory + state = next_state # move to the next state + agent.update() # update the networks every step + if done: + break + if i_ep % cfg.target_update == 0: + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}') + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('Complete training!') + 
return rewards,ma_rewards + +def eval(cfg,env,agent): + rewards = [] + ma_rewards = [] + for i_ep in range(cfg.eval_eps): + state = env.reset() + ep_reward = 0 + while True: + action = agent.predict(state) + next_state, reward, done, _ = env.step(action) + state = next_state + ep_reward += reward + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print(f"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}") + return rewards,ma_rewards +if __name__ == "__main__": + cfg = DoubleDQNConfig() + env,agent = env_agent_config(cfg,seed=1) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(cfg.result_path, cfg.model_path) + agent.save(path=cfg.model_path) + save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) + plot_rewards(rewards, ma_rewards, tag="train", + algo=cfg.algo, path=cfg.result_path) + + env,agent = env_agent_config(cfg,seed=10) + agent.load(path=cfg.model_path) + rewards,ma_rewards = eval(cfg,env,agent) + save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) diff --git a/codes/DoubleDQN/utils.py b/codes/DoubleDQN/utils.py deleted file mode 100644 index c5f5305..0000000 --- a/codes/DoubleDQN/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-10-15 21:28:00 -LastEditor: John -LastEditTime: 2020-10-15 21:50:30 -Discription: -Environment: -''' -import os -import numpy as np - - -def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./results'): - if not os.path.exists(result_path): # 检测是否存在文件夹 - os.mkdir(result_path) - np.save(result_path+'rewards_'+tag+'.npy', rewards) - np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards) - np.save(result_path+'steps_'+tag+'.npy',ep_steps ) \ No newline at end of file diff --git a/codes/README_en.md b/codes/README_en.md index 5e9a30c..5b2f707 100644 --- a/codes/README_en.md +++ b/codes/README_en.md @@ -21,9 +21,6 @@ Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in diff python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0 ## Usage -运行带有```train```的py文件或ipynb文件进行训练,如果前面带有```task```如```task0_train.py```,表示对task0任务训练 -类似的带有```eval```即为测试。 - run python scripts or jupyter notebook file with ```train``` to train the agent, if there is a ```task``` like ```task0_train.py```, it means to train with task 0. similar to file with ```eval```, which means to evaluate the agent. @@ -36,7 +33,7 @@ similar to file with ```eval```, which means to evaluate the agent. 
| [Q-Learning](./QLearning) | [towardsdatascience blog](https://towardsdatascience.com/simple-reinforcement-learning-q-learning-fcddc4b6fe56),[q learning paper](https://ieeexplore.ieee.org/document/8836506) | [CliffWalking-v0](./envs/gym_info.md) | | | [Sarsa](./Sarsa) | [geeksforgeeks blog](https://www.geeksforgeeks.org/sarsa-reinforcement-learning/) | [Racetrack](./envs/racetrack_env.md) | | | [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf),[Nature DQN Paper](https://www.nature.com/articles/nature14236) | [CartPole-v0](./envs/gym_info.md) | | -| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | 与DQN相比使用了CNN而不是全链接网络 | +| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | | | [DoubleDQN](./DoubleDQN) | [DoubleDQN Paper](https://arxiv.org/abs/1509.06461) | [CartPole-v0](./envs/gym_info.md) | | | [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | | | [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | | diff --git a/codes/common/model.py b/codes/common/model.py index 41785fd..257c33b 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 21:14:12 LastEditor: John -LastEditTime: 2021-03-31 13:49:06 +LastEditTime: 2021-05-04 02:45:27 Discription: Environment: ''' @@ -63,7 +63,7 @@ class Actor(nn.Module): def forward(self, x): x = F.relu(self.linear1(x)) x = F.relu(self.linear2(x)) - x = F.tanh(self.linear3(x)) + x = torch.tanh(self.linear3(x)) return x class ActorCritic(nn.Module): diff --git a/codes/test.py b/codes/test.py deleted file mode 100644 index 90ccec4..0000000 --- a/codes/test.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-03-25 23:25:15 -LastEditor: JiangJi -LastEditTime: 2021-04-28 21:36:50 -Discription: -Environment: -''' -import random -dic = {0:"鳗鱼家",1:"一心",2:"bada"} -print("0:鳗鱼家,1:一心,2:bada") -print("三次随机,取最后一次选择") -for i in range(3): - if i ==2: - print(f"去{dic[random.randint(0,2)]}") - else: - print(f"不去{dic[random.randint(0,2)]}") \ No newline at end of file
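Note on `soft_tau`: the new `DDPGConfig` in `codes/DDPG/task0_train.py` defines `soft_tau = 1e-2`, but the hunk of `codes/DDPG/agent.py` that uses it is not shown in this diff. As a rough, illustrative sketch (not the repository's exact code), a soft/Polyak target-network update controlled by such a coefficient typically looks like the snippet below; the helper name `soft_update` is hypothetical.

```python
import torch.nn as nn

def soft_update(target_net: nn.Module, net: nn.Module, soft_tau: float) -> None:
    """Polyak-average the online network's parameters into the target network.

    Illustrative sketch of the update a soft_tau hyperparameter usually controls
    in DDPG; the repository's own DDPG.update() may differ.
    """
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        # target <- tau * online + (1 - tau) * target
        target_param.data.copy_(
            soft_tau * param.data + (1.0 - soft_tau) * target_param.data
        )

# e.g. called once per training step, after the actor/critic optimizer steps:
# soft_update(agent.target_critic, agent.critic, cfg.soft_tau)
# soft_update(agent.target_actor, agent.actor, cfg.soft_tau)
```

With `soft_tau` close to zero, the target networks track the online networks slowly, which keeps the bootstrapped critic targets stable during training.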