diff --git a/codes/DQN/main.py b/codes/DQN/main.py index 19285d4..2478050 100644 --- a/codes/DQN/main.py +++ b/codes/DQN/main.py @@ -5,68 +5,78 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-04-13 19:03:39 +LastEditTime: 2021-04-18 14:44:45 @Discription: @Environment: python 3.7.7 ''' -import sys,os -curr_path = os.path.dirname(__file__) -parent_path=os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import gym -import torch -import datetime -from DQN.agent import DQN +from common.utils import save_results, make_dir, del_empty_dir from common.plot import plot_rewards -from common.utils import save_results,make_dir,del_empty_dir +from DQN.agent import DQN +import datetime +import torch +import gym +import sys +import os +curr_path = os.path.dirname(__file__) +parent_path = os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + + +curr_time = datetime.datetime.now().strftime( + "%Y%m%d-%H%M%S") # obtain current time -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time class DQNConfig: def __init__(self): self.algo = "DQN" # name of algo self.env = 'CartPole-v0' - self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/' # path to save results + self.result_path = curr_path+"/outputs/" + self.env + \ '/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/outputs/" + self.env + \ '/'+curr_time+'/models/' # path to save models self.gamma = 0.95 - self.epsilon_start = 1 # e-greedy策略的初始epsilon + self.epsilon_start = 1 # e-greedy策略的初始epsilon self.epsilon_end = 0.01 self.epsilon_decay = 500 - self.lr = 0.0001 # learning rate + self.lr = 0.0001 # learning rate - self.memory_capacity = 10000 # Replay Memory容量 + self.memory_capacity = 10000 # Replay Memory容量 self.batch_size = 32 - self.train_eps = 10 # 训练的episode数目 - self.target_update = 2 # target net的更新频率 - self.eval_eps = 20 # 测试的episode数目 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - self.hidden_dim = 256 # 神经网络隐藏层维度 - -def train(cfg,env,agent): + self.train_eps = 300 # 训练的episode数目 + self.target_update = 2 # target net的更新频率 + self.eval_eps = 20 # 测试的episode数目 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测gpu + self.hidden_dim = 256 # 神经网络隐藏层维度 + + +def train(cfg, env, agent): print('Start to train !') + print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') rewards = [] - ma_rewards = [] # moveing average reward + ma_rewards = [] # moving average reward for i_episode in range(cfg.train_eps): - state = env.reset() + state = env.reset() done = False ep_reward = 0 while not done: - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() + agent.memory.push(state, action, reward, next_state, done) + state = next_state + agent.update() if i_episode % cfg.target_update == 0: agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward)) + print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) # 计算滑动窗口的reward if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: - 
ma_rewards.append(ep_reward) + ma_rewards.append(ep_reward) print('Complete training!') - return rewards,ma_rewards + return rewards, ma_rewards + if __name__ == "__main__": cfg = DQNConfig() @@ -74,9 +84,10 @@ if __name__ == "__main__": env.seed(1) state_dim = env.observation_space.shape[0] action_dim = env.action_space.n - agent = DQN(state_dim,action_dim,cfg) - rewards,ma_rewards = train(cfg,env,agent) - make_dir(cfg.result_path) - agent.save(path=cfg.result_path) - save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=cfg.result_path) \ No newline at end of file + agent = DQN(state_dim, action_dim, cfg) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(cfg.result_path, cfg.model_path) + agent.save(path=cfg.model_path) + save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) + plot_rewards(rewards, ma_rewards, tag="train", + algo=cfg.algo, path=cfg.result_path) diff --git a/codes/DQN/outputs/CartPole-v0/20210418-143542/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20210418-143542/models/dqn_checkpoint.pth new file mode 100644 index 0000000..3bc041d Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210418-143542/models/dqn_checkpoint.pth differ diff --git a/codes/DQN/outputs/CartPole-v0/20210418-143542/results/ma_rewards_train.npy b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/ma_rewards_train.npy new file mode 100644 index 0000000..152ad7a Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/ma_rewards_train.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_curve_train.png b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_curve_train.png new file mode 100644 index 0000000..ad42573 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_curve_train.png differ diff --git a/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_train.npy b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_train.npy new file mode 100644 index 0000000..58fb2b8 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210418-143542/results/rewards_train.npy differ diff --git a/codes/DQN/results/CartPole-v0/20210413-185605/dqn_checkpoint.pth b/codes/DQN/results/CartPole-v0/20210413-185605/dqn_checkpoint.pth deleted file mode 100644 index 067e6f4..0000000 Binary files a/codes/DQN/results/CartPole-v0/20210413-185605/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/results/CartPole-v0/20210413-185605/ma_rewards_train.npy b/codes/DQN/results/CartPole-v0/20210413-185605/ma_rewards_train.npy deleted file mode 100644 index 5e8948d..0000000 Binary files a/codes/DQN/results/CartPole-v0/20210413-185605/ma_rewards_train.npy and /dev/null differ diff --git a/codes/DQN/results/CartPole-v0/20210413-185605/rewards_curve_train.png b/codes/DQN/results/CartPole-v0/20210413-185605/rewards_curve_train.png deleted file mode 100644 index 8106ccf..0000000 Binary files a/codes/DQN/results/CartPole-v0/20210413-185605/rewards_curve_train.png and /dev/null differ diff --git a/codes/DQN/results/CartPole-v0/20210413-185605/rewards_train.npy b/codes/DQN/results/CartPole-v0/20210413-185605/rewards_train.npy deleted file mode 100644 index c7c0370..0000000 Binary files a/codes/DQN/results/CartPole-v0/20210413-185605/rewards_train.npy and /dev/null differ diff --git a/codes/DQN/task1.py b/codes/DQN/task1.py deleted file mode 100644 index e274bcc..0000000 --- 
a/codes/DQN/task1.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-12 00:48:57 -@LastEditor: John -LastEditTime: 2021-04-13 18:49:44 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(__file__) -parent_path=os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import gym -import torch -import datetime -from DQN.agent import DQN -from common.plot import plot_rewards -from common.utils import save_results,make_dir,del_empty_dir - -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time -SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model -RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards -make_dir(curr_path+"/saved_model/",curr_path+"/results/") -del_empty_dir(curr_path+"/saved_model/",curr_path+"/results/") - -class DQNConfig: - def __init__(self): - self.env = 'LunarLander-v2' - self.algo = "DQN" # name of algo - self.gamma = 0.95 - self.epsilon_start = 1 # e-greedy策略的初始epsilon - self.epsilon_end = 0.01 - self.epsilon_decay = 500 - self.lr = 0.0001 # learning rate - self.memory_capacity = 1000000 # Replay Memory容量 - self.batch_size = 64 - self.train_eps = 300 # 训练的episode数目 - self.train_steps = 1000 - self.target_update = 2 # target net的更新频率 - self.eval_eps = 20 # 测试的episode数目 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - self.hidden_dim = 256 # 神经网络隐藏层维度 - -def train(cfg,env,agent): - print('Start to train !') - rewards = [] - ma_rewards = [] # moveing average reward - for i_episode in range(cfg.train_eps): - state = env.reset() - ep_reward = 0 - for i_step in range(cfg.train_steps): - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() - if done: - break - if i_episode % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward)) - rewards.append(ep_reward) - # 计算滑动窗口的reward - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DQNConfig() - env = gym.make(cfg.env) - env.seed(1) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = DQN(state_dim,action_dim,cfg) - rewards,ma_rewards = train(cfg,env,agent) - make_dir(SAVED_MODEL_PATH,RESULT_PATH) - agent.save(path=SAVED_MODEL_PATH) - save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) - del_empty_dir(SAVED_MODEL_PATH,RESULT_PATH) \ No newline at end of file diff --git a/codes/PPO/agent.py b/codes/PPO/agent.py index 7e8c0f6..c0bfd0c 100644 --- a/codes/PPO/agent.py +++ b/codes/PPO/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-23 15:17:42 LastEditor: John -LastEditTime: 2021-04-11 01:24:24 +LastEditTime: 2021-04-28 10:11:09 Discription: Environment: ''' @@ -17,7 +17,6 @@ from PPO.model import Actor,Critic from PPO.memory import PPOMemory class PPO: def __init__(self, state_dim, action_dim,cfg): - self.env = cfg.env self.gamma = cfg.gamma self.policy_clip = cfg.policy_clip self.n_epochs = 
cfg.n_epochs @@ -84,13 +83,13 @@ class PPO: self.critic_optimizer.step() self.memory.clear() def save(self,path): - actor_checkpoint = os.path.join(path, self.env+'_actor.pt') - critic_checkpoint= os.path.join(path, self.env+'_critic.pt') + actor_checkpoint = os.path.join(path, 'ppo_actor.pt') + critic_checkpoint= os.path.join(path, 'ppo_critic.pt') torch.save(self.actor.state_dict(), actor_checkpoint) torch.save(self.critic.state_dict(), critic_checkpoint) def load(self,path): - actor_checkpoint = os.path.join(path, self.env+'_actor.pt') - critic_checkpoint= os.path.join(path, self.env+'_critic.pt') + actor_checkpoint = os.path.join(path, 'ppo_actor.pt') + critic_checkpoint= os.path.join(path, 'ppo_critic.pt') self.actor.load_state_dict(torch.load(actor_checkpoint)) self.critic.load_state_dict(torch.load(critic_checkpoint)) diff --git a/codes/PPO/main.py b/codes/PPO/main.py index 18f4a62..85febef 100644 --- a/codes/PPO/main.py +++ b/codes/PPO/main.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-22 16:18:10 LastEditor: John -LastEditTime: 2021-04-11 01:24:41 +LastEditTime: 2021-04-28 10:13:00 Discription: Environment: ''' @@ -19,24 +19,16 @@ import torch import datetime from PPO.agent import PPO from common.plot import plot_rewards -from common.utils import save_results +from common.utils import save_results,make_dir -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 -if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹 - os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") -if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 - os.mkdir(SAVED_MODEL_PATH) -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 -if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹 - os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") -if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 - os.mkdir(RESULT_PATH) +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time class PPOConfig: def __init__(self) -> None: self.env = 'CartPole-v0' self.algo = 'PPO' + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models self.batch_size = 5 self.gamma=0.99 self.n_epochs = 4 @@ -50,12 +42,10 @@ class PPOConfig: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu def train(cfg,env,agent): - best_reward = env.reward_range[0] rewards= [] ma_rewards = [] # moving average rewards - avg_reward = 0 running_steps = 0 - for i_episode in range(cfg.train_eps): + for i_ep in range(cfg.train_eps): state = env.reset() done = False ep_reward = 0 @@ -74,21 +64,18 @@ def train(cfg,env,agent): 0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - avg_reward = np.mean(rewards[-100:]) - if avg_reward > best_reward: - best_reward = avg_reward - agent.save(path=SAVED_MODEL_PATH) - print('Episode:{}/{}, Reward:{:.1f}, avg reward:{:.1f}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,avg_reward,done)) + print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}") return rewards,ma_rewards if __name__ == '__main__': - cfg = PPOConfig() + cfg = PPOConfig() env = 
gym.make(cfg.env) - env.seed(1) + env.seed(1) # Set seeds state_dim=env.observation_space.shape[0] action_dim=env.action_space.n agent = PPO(state_dim,action_dim,cfg) rewards,ma_rewards = train(cfg,env,agent) - save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) - plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) \ No newline at end of file + make_dir(cfg.result_path,cfg.model_path) + agent.save(path=cfg.model_path) + save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) \ No newline at end of file diff --git a/codes/PPO/results/20210411-010116/ma_rewards_train.npy b/codes/PPO/results/20210411-010116/ma_rewards_train.npy deleted file mode 100644 index 73e1b88..0000000 Binary files a/codes/PPO/results/20210411-010116/ma_rewards_train.npy and /dev/null differ diff --git a/codes/PPO/results/20210411-010116/rewards_curve_train.png b/codes/PPO/results/20210411-010116/rewards_curve_train.png deleted file mode 100644 index 6118d73..0000000 Binary files a/codes/PPO/results/20210411-010116/rewards_curve_train.png and /dev/null differ diff --git a/codes/PPO/results/20210411-010116/rewards_train.npy b/codes/PPO/results/20210411-010116/rewards_train.npy deleted file mode 100644 index 690f450..0000000 Binary files a/codes/PPO/results/20210411-010116/rewards_train.npy and /dev/null differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_actor.pt b/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_actor.pt new file mode 100644 index 0000000..252815d Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_actor.pt differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_critic.pt b/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_critic.pt new file mode 100644 index 0000000..a67f3eb Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101400/models/ppo_critic.pt differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101400/results/ma_rewards_train.npy b/codes/PPO/results/CartPole-v0/20210428-101400/results/ma_rewards_train.npy new file mode 100644 index 0000000..3772867 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101400/results/ma_rewards_train.npy differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_curve_train.png b/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_curve_train.png new file mode 100644 index 0000000..378779d Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_curve_train.png differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_train.npy b/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_train.npy new file mode 100644 index 0000000..af131b9 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101400/results/rewards_train.npy differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_actor.pt b/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_actor.pt new file mode 100644 index 0000000..516e740 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_actor.pt differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_critic.pt b/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_critic.pt new file mode 100644 index 0000000..489e43d Binary files /dev/null and 
b/codes/PPO/results/CartPole-v0/20210428-101634/models/ppo_critic.pt differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101634/results/ma_rewards_train.npy b/codes/PPO/results/CartPole-v0/20210428-101634/results/ma_rewards_train.npy new file mode 100644 index 0000000..70dc625 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101634/results/ma_rewards_train.npy differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_curve_train.png b/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_curve_train.png new file mode 100644 index 0000000..9c31971 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_curve_train.png differ diff --git a/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_train.npy b/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_train.npy new file mode 100644 index 0000000..477be61 Binary files /dev/null and b/codes/PPO/results/CartPole-v0/20210428-101634/results/rewards_train.npy differ diff --git a/codes/PPO/saved_model/20210411-010116/CartPole-v0_actor.pt b/codes/PPO/saved_model/20210411-010116/CartPole-v0_actor.pt deleted file mode 100644 index 7218d36..0000000 Binary files a/codes/PPO/saved_model/20210411-010116/CartPole-v0_actor.pt and /dev/null differ diff --git a/codes/PPO/saved_model/20210411-010116/CartPole-v0_critic.pt b/codes/PPO/saved_model/20210411-010116/CartPole-v0_critic.pt deleted file mode 100644 index fc45870..0000000 Binary files a/codes/PPO/saved_model/20210411-010116/CartPole-v0_critic.pt and /dev/null differ diff --git a/codes/PolicyGradient/README.md b/codes/PolicyGradient/README.md index cc6edf3..0f9fec3 100644 --- a/codes/PolicyGradient/README.md +++ b/codes/PolicyGradient/README.md @@ -1,14 +1,18 @@ # Policy Gradient -实现的是Policy Gradient最基本的REINFORCE方法 -## 使用说明 -直接运行```main.py```即可 -## 原理讲解 -参考我的博客[Policy Gradient算法实战](https://blog.csdn.net/JohnJim0/article/details/110236851) -## 环境 -python 3.7.9、pytorch 1.6.0 -## 程序运行方法 +Policy-based方法是强化学习中与Value-based(比如Q-learning)相对的方法,其目的是对策略本身进行梯度下降,相关基础知识参考[Datawhale-Policy Gradient](https://datawhalechina.github.io/leedeeprl-notes/#/chapter4/chapter4)。 +其中REINFORCE是一个最基本的Policy Gradient方法,主要解决策略梯度无法直接计算的问题,具体原理参考[CSDN-REINFORCE和Reparameterization Trick](https://blog.csdn.net/JohnJim0/article/details/110230703) + +## 伪代码 + +结合REINFORCE原理,其伪代码如下: + +![img](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210428001336032.png) + +## 实现 + + ## 参考 diff --git a/codes/PolicyGradient/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210428001336032.png b/codes/PolicyGradient/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210428001336032.png new file mode 100644 index 0000000..44c1874 Binary files /dev/null and b/codes/PolicyGradient/assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210428001336032.png differ diff --git a/codes/README.md b/codes/README.md index 5b2104b..fd86303 100644 --- a/codes/README.md +++ b/codes/README.md @@ -22,7 +22,9 @@ python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0 ## 使用说明 -运行```main.py```或者```main.ipynb```,或者包含```task```名的文件(比如```task1.py```) +运行带有```train```的py文件或ipynb文件进行训练,如果前面带有```task```如```task0_train.py```,表示对task0任务训练 +类似的带有```eval```即为测试。 + ## 算法进度 | 算法名称 | 
相关论文材料 | 环境 | 备注 | @@ -45,11 +47,8 @@ python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0 - ## Refs - [RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) [RL-Adventure](https://github.com/higgsfield/RL-Adventure) - diff --git a/codes/README_en.md b/codes/README_en.md index a084e81..f3a95d6 100644 --- a/codes/README_en.md +++ b/codes/README_en.md @@ -19,10 +19,14 @@ Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in diff ## Runnig Environment -python 3.7.9、pytorch 1.6.0、gym 0.18.0 +python 3.7、pytorch 1.6.0-1.7.1、gym 0.17.0-0.18.0 ## Usage +运行带有```train```的py文件或ipynb文件进行训练,如果前面带有```task```如```task0_train.py```,表示对task0任务训练 +类似的带有```eval```即为测试。 -run ```main.py``` or ```main.ipynb```, or run files with ```task```(like ```task1.py```) +run python scripts or jupyter notebook file with ```train``` to train the agent, if there is a ```task``` like ```task0_train.py```, it means to train with task 0. + +similar to file with ```eval```, which means to evaluate the agent. ## Schedule @@ -51,5 +55,3 @@ run ```main.py``` or ```main.ipynb```, or run files with ```task```(like ```task [RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) [RL-Adventure](https://github.com/higgsfield/RL-Adventure) - - diff --git a/codes/RandomPolicy/main.py b/codes/RandomPolicy/main.py new file mode 100644 index 0000000..897cc35 --- /dev/null +++ b/codes/RandomPolicy/main.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-04-21 11:07:57 +LastEditor: JiangJi +LastEditTime: 2021-04-21 11:15:00 +Discription: +Environment: +''' +import sys,os +curr_path = os.path.dirname(__file__) +parent_path=os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import torch +import gym +import numpy as np +import datetime + +from common.plot import plot_rewards +from common.utils import save_results,make_dir + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + + +class TD3Config: + def __init__(self) -> None: + self.algo = 'TD3' + self.env = 'HalfCheetah-v2' + self.seed = 0 + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models + self.eval_freq = 5e3 # How often (time steps) we evaluate + # self.train_eps = 800 + self.max_timestep = 4000000 # Max time steps to run environment + +# Runs policy for X episodes and returns average reward +# A fixed seed is used for the eval environment +def eval(env_name,seed, eval_episodes=10): + eval_env = gym.make(env_name) + eval_env.seed(seed + 100) + avg_reward = 0. 
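Looping back to the PolicyGradient README earlier in this diff: its `## 实现` (implementation) section is left empty, while the prose and the pseudocode figure describe the basic REINFORCE update. The sketch below is an assumed minimal PyTorch version of that update, for illustration only — the helper name `reinforce_update` and the `log_probs`/`optimizer` arguments are hypothetical, and this is not the code shipped in `codes/PolicyGradient`:

```python
import torch

def reinforce_update(optimizer, log_probs, rewards, gamma=0.99):
    """One REINFORCE step from a finished episode (illustrative sketch)."""
    returns, G = [], 0.0
    for r in reversed(rewards):        # discounted return G_t, built backwards
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # simple variance reduction
    loss = -(torch.stack(log_probs) * returns).sum()  # maximize return-weighted log-probability
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```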
+ for _ in range(eval_episodes): + state, done = eval_env.reset(), False + while not done: + # eval_env.render() + action = eval_env.action_space.sample() + state, reward, done, _ = eval_env.step(action) + avg_reward += reward + avg_reward /= eval_episodes + print("---------------------------------------") + print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}") + print("---------------------------------------") + return avg_reward + +def train(cfg,env): + # Evaluate untrained policy + evaluations = [eval(cfg.env, cfg.seed)] + state, done = env.reset(), False + ep_reward = 0 + ep_timesteps = 0 + episode_num = 0 + rewards = [] + ma_rewards = [] # moveing average reward + for t in range(int(cfg.max_timestep)): + ep_timesteps += 1 + # Select action randomly + action = env.action_space.sample() + # Perform action + next_state, reward, done, _ = env.step(action) + state = next_state + ep_reward += reward + if done: + # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True + print(f"Episode:{episode_num+1}, Episode T:{ep_timesteps}, Reward:{ep_reward:.3f}") + # Reset environment + state, done = env.reset(), False + rewards.append(ep_reward) + # 计算滑动窗口的reward + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + ep_reward = 0 + ep_timesteps = 0 + episode_num += 1 + # Evaluate episode + if (t + 1) % cfg.eval_freq == 0: + evaluations.append(eval(cfg.env, cfg.seed)) + return rewards, ma_rewards + +if __name__ == "__main__": + cfg = TD3Config() + env = gym.make(cfg.env) + env.seed(cfg.seed) # Set seeds + torch.manual_seed(cfg.seed) + np.random.seed(cfg.seed) + rewards,ma_rewards = train(cfg,env) + make_dir(cfg.result_path) + save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/' + # agent.load(cfg.result_path) + # eval(cfg.env,agent, cfg.seed) + + diff --git a/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/ma_rewards_train.npy b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/ma_rewards_train.npy new file mode 100644 index 0000000..d542658 Binary files /dev/null and b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/ma_rewards_train.npy differ diff --git a/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_curve_train.png b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_curve_train.png new file mode 100644 index 0000000..d54cee0 Binary files /dev/null and b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_curve_train.png differ diff --git a/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_train.npy b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_train.npy new file mode 100644 index 0000000..1fc3be1 Binary files /dev/null and b/codes/RandomPolicy/results/HalfCheetah-v2/20210421-111223/models/rewards_train.npy differ diff --git a/codes/TD3/agent.py b/codes/TD3/agent.py index bad7ab1..3d43700 100644 --- a/codes/TD3/agent.py +++ b/codes/TD3/agent.py @@ -92,14 +92,10 @@ class TD3(object): self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4) self.memory = ReplayBuffer(state_dim, action_dim) - - - def choose_action(self, state): state = torch.FloatTensor(state.reshape(1, -1)).to(self.device) return 
self.actor(state).cpu().data.numpy().flatten() - def update(self): self.total_it += 1 @@ -167,4 +163,4 @@ class TD3(object): self.actor.load_state_dict(torch.load(path + "td3_actor")) self.actor_optimizer.load_state_dict(torch.load(path + "td3_actor_optimizer")) self.actor_target = copy.deepcopy(self.actor) - \ No newline at end of file + diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/ma_rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210416-003720/ma_rewards_train.npy deleted file mode 100644 index 3a60db5..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/ma_rewards_train.npy and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_curve_train.png b/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_curve_train.png deleted file mode 100644 index 9a083b7..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_curve_train.png and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_train.npy deleted file mode 100644 index fbbc319..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/rewards_train.npy and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor b/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor deleted file mode 100644 index e2103a8..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor_optimizer b/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor_optimizer deleted file mode 100644 index 40fe482..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_actor_optimizer and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic b/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic deleted file mode 100644 index 18a90dc..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic_optimizer b/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic_optimizer deleted file mode 100644 index 42bed4e..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210416-003720/td3_critic_optimizer and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor new file mode 100644 index 0000000..2b3b481 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer new file mode 100644 index 0000000..9bb6195 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic new file mode 100644 index 0000000..cccfb71 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer new file mode 100644 index 
0000000..1446c66 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy new file mode 100644 index 0000000..96d40db Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png new file mode 100644 index 0000000..e310371 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_train.npy new file mode 100644 index 0000000..718e407 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_train.npy differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy new file mode 100644 index 0000000..9680e2a Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png new file mode 100644 index 0000000..a54dced Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy new file mode 100644 index 0000000..d9f349d Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor new file mode 100644 index 0000000..b154cc1 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer new file mode 100644 index 0000000..28504b0 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic new file mode 100644 index 0000000..4bdff3f Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer new file mode 100644 index 0000000..3fffdf3 Binary files /dev/null and b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor new file mode 100644 index 0000000..ae4dfb2 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor differ diff --git 
a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer new file mode 100644 index 0000000..42755e1 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic new file mode 100644 index 0000000..b6181a0 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer new file mode 100644 index 0000000..b2ad60f Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png b/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png new file mode 100644 index 0000000..ee4d1af Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy new file mode 100644 index 0000000..c2509fb Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy new file mode 100644 index 0000000..6181f14 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png new file mode 100644 index 0000000..14a7455 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy new file mode 100644 index 0000000..80fd298 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy new file mode 100644 index 0000000..eadc498 Binary files /dev/null and b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy differ diff --git a/codes/TD3/task0_eval.py b/codes/TD3/task0_eval.py new file mode 100644 index 0000000..0420dce --- /dev/null +++ b/codes/TD3/task0_eval.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-04-23 20:36:23 +LastEditor: JiangJi +LastEditTime: 2021-04-23 20:37:22 +Discription: +Environment: +''' +import sys,os +curr_path = os.path.dirname(__file__) +parent_path=os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import torch +import gym +import numpy as np +import datetime + + +from TD3.agent import TD3 +from common.plot import plot_rewards +from common.utils import save_results,make_dir + 
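`common/utils.py` itself is not touched by this diff, but every script above now calls `make_dir(cfg.result_path, cfg.model_path)` to build the new per-run layout (`.../<env>/<timestamp>/results/` for curves and `.../<env>/<timestamp>/models/` for checkpoints; the DQN script uses `outputs/` as its top-level folder). As an assumption about what that variadic helper does — a sketch only, not the repository's actual implementation:

```python
import os

def make_dir(*paths):
    """Create each output directory for the current run if it does not exist yet."""
    for path in paths:
        os.makedirs(path, exist_ok=True)
```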
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + +class TD3Config: + def __init__(self) -> None: + self.algo = 'TD3 and Random' + self.env = 'HalfCheetah-v2' + self.seed = 0 + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models + self.start_timestep = 25e3 # Time steps initial random policy is used + self.eval_freq = 5e3 # How often (time steps) we evaluate + self.max_timestep = 200000 # Max time steps to run environment + self.expl_noise = 0.1 # Std of Gaussian exploration noise + self.batch_size = 256 # Batch size for both actor and critic + self.gamma = 0.99 # gamma factor + self.lr = 0.0005 # Target network update rate + self.policy_noise = 0.2 # Noise added to target policy during critic update + self.noise_clip = 0.5 # Range to clip target policy noise + self.policy_freq = 2 # Frequency of delayed policy updates + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Runs policy for X episodes and returns average reward +# A fixed seed is used for the eval environment +def eval(env_name,agent, seed, eval_episodes=50): + eval_env = gym.make(env_name) + eval_env.seed(seed + 100) + rewards,ma_rewards =[],[] + for i_episode in range(eval_episodes): + ep_reward = 0 + state, done = eval_env.reset(), False + while not done: + eval_env.render() + action = agent.choose_action(np.array(state)) + state, reward, done, _ = eval_env.step(action) + ep_reward += reward + print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}") + rewards.append(ep_reward) + # 计算滑动窗口的reward + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + return rewards,ma_rewards + +if __name__ == "__main__": + cfg = TD3Config() + env = gym.make(cfg.env) + env.seed(cfg.seed) # Set seeds + torch.manual_seed(cfg.seed) + np.random.seed(cfg.seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + max_action = float(env.action_space.high[0]) + td3= TD3(state_dim,action_dim,max_action,cfg) + cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/' + td3.load(cfg.model_path) + td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed) + make_dir(cfg.result_path,cfg.model_path) + save_results(td3_rewards,td3_ma_rewards,tag='eval',path=cfg.result_path) + plot_rewards({'td3_rewards':td3_rewards,'td3_ma_rewards':td3_ma_rewards,},tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/' + # agent.load(cfg.result_path) + # eval(cfg.env,agent, cfg.seed) \ No newline at end of file diff --git a/codes/TD3/main.py b/codes/TD3/task0_train.py similarity index 93% rename from codes/TD3/main.py rename to codes/TD3/task0_train.py index b0766ec..11e2adf 100644 --- a/codes/TD3/main.py +++ b/codes/TD3/task0_train.py @@ -21,11 +21,12 @@ class TD3Config: self.algo = 'TD3' self.env = 'HalfCheetah-v2' self.seed = 0 - self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/' # path to save results + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models self.start_timestep = 25e3 # Time steps initial random policy is used self.eval_freq = 5e3 # How often (time steps) we evaluate # self.train_eps = 800 - self.max_timestep = 
1600000 # Max time steps to run environment + self.max_timestep = 4000000 # Max time steps to run environment self.expl_noise = 0.1 # Std of Gaussian exploration noise self.batch_size = 256 # Batch size for both actor and critic self.gamma = 0.99 # gamma factor @@ -161,9 +162,12 @@ if __name__ == "__main__": max_action = float(env.action_space.high[0]) agent = TD3(state_dim,action_dim,max_action,cfg) rewards,ma_rewards = train(cfg,env,agent) - make_dir(cfg.result_path) - agent.save(path=cfg.result_path) + make_dir(cfg.result_path,cfg.model_path) + agent.save(path=cfg.model_path) save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/' + # agent.load(cfg.result_path) + # eval(cfg.env,agent, cfg.seed) diff --git a/codes/TD3/task1_eval.py b/codes/TD3/task1_eval.py new file mode 100644 index 0000000..ae17681 --- /dev/null +++ b/codes/TD3/task1_eval.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-04-23 20:36:23 +LastEditor: JiangJi +LastEditTime: 2021-04-28 10:14:33 +Discription: +Environment: +''' +import sys,os +curr_path = os.path.dirname(__file__) +parent_path=os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import torch +import gym +import numpy as np +import datetime + + +from TD3.agent import TD3 +from common.plot import plot_rewards +from common.utils import save_results,make_dir + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + +class TD3Config: + def __init__(self) -> None: + self.algo = 'TD3' + self.env = 'Pendulum-v0' + self.seed = 0 + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models + self.batch_size = 256 # Batch size for both actor and critic + self.gamma = 0.99 # gamma factor + self.lr = 0.0005 # Target network update rate + self.policy_noise = 0.2 # Noise added to target policy during critic update + self.noise_clip = 0.5 # Range to clip target policy noise + self.policy_freq = 2 # Frequency of delayed policy updates + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Runs policy for X episodes and returns average reward +# A fixed seed is used for the eval environment +def eval(env_name,agent, seed, eval_episodes=50): + eval_env = gym.make(env_name) + eval_env.seed(seed + 100) + rewards,ma_rewards =[],[] + for i_episode in range(eval_episodes): + ep_reward = 0 + state, done = eval_env.reset(), False + while not done: + # eval_env.render() + action = agent.choose_action(np.array(state)) + state, reward, done, _ = eval_env.step(action) + ep_reward += reward + print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}") + rewards.append(ep_reward) + # 计算滑动窗口的reward + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + return rewards,ma_rewards + +if __name__ == "__main__": + cfg = TD3Config() + env = gym.make(cfg.env) + env.seed(cfg.seed) # Set seeds + torch.manual_seed(cfg.seed) + np.random.seed(cfg.seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + max_action = float(env.action_space.high[0]) + td3= TD3(state_dim,action_dim,max_action,cfg) + cfg.model_path = 
'./TD3/results/Pendulum-v0/20210428-092059/models/' + cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/' + td3.load(cfg.model_path) + rewards,ma_rewards = eval(cfg.env,td3,cfg.seed) + make_dir(cfg.result_path,cfg.model_path) + save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) \ No newline at end of file diff --git a/codes/TD3/task1_train.py b/codes/TD3/task1_train.py new file mode 100644 index 0000000..5c0d533 --- /dev/null +++ b/codes/TD3/task1_train.py @@ -0,0 +1,112 @@ +import sys,os +curr_path = os.path.dirname(__file__) +parent_path=os.path.dirname(curr_path) +sys.path.append(parent_path) # add current terminal path to sys.path + +import torch +import gym +import numpy as np +import datetime + + +from TD3.agent import TD3 +from common.plot import plot_rewards +from common.utils import save_results,make_dir + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time + + +class TD3Config: + def __init__(self) -> None: + self.algo = 'TD3' + self.env = 'Pendulum-v0' + self.seed = 0 + self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results + self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models + self.start_timestep = 25e3 # Time steps initial random policy is used + self.start_ep = 50 # Episodes initial random policy is used + self.eval_freq = 10 # How often (episodes) we evaluate + self.train_eps = 600 + self.max_timestep = 100000 # Max time steps to run environment + self.expl_noise = 0.1 # Std of Gaussian exploration noise + self.batch_size = 256 # Batch size for both actor and critic + self.gamma = 0.9 # gamma factor + self.lr = 0.0005 # Target network update rate + self.policy_noise = 0.2 # Noise added to target policy during critic update + self.noise_clip = 0.3 # Range to clip target policy noise + self.policy_freq = 2 # Frequency of delayed policy updates + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Runs policy for X episodes and returns average reward +# A fixed seed is used for the eval environment +def eval(env,agent, seed, eval_episodes=10): + eval_env = gym.make(env) + eval_env.seed(seed + 100) + avg_reward = 0. 
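The `policy_noise` and `noise_clip` entries in the TD3 configs above control target policy smoothing, but `TD3/agent.py` is only touched cosmetically in this diff, so the update itself is not shown here. Below is a sketch of the standard TD3 target these parameters feed; the function and argument names are assumptions, and the twin critic is assumed to return both Q heads:

```python
import torch

def td3_target_q(actor_target, critic_target, next_state, reward, done,
                 gamma=0.99, policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    """Clipped-noise TD3 target used in the critic update (illustrative sketch)."""
    noise = (torch.randn_like(actor_target(next_state)) * policy_noise
             ).clamp(-noise_clip, noise_clip)                 # smoothing noise, clipped
    next_action = (actor_target(next_state) + noise).clamp(-max_action, max_action)
    q1, q2 = critic_target(next_state, next_action)           # twin critic heads
    return reward + (1.0 - done) * gamma * torch.min(q1, q2)  # take the smaller target
```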
+ for _ in range(eval_episodes): + state, done = eval_env.reset(), False + while not done: + # eval_env.render() + action = agent.choose_action(np.array(state)) + state, reward, done, _ = eval_env.step(action) + avg_reward += reward + avg_reward /= eval_episodes + print("---------------------------------------") + print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}") + print("---------------------------------------") + return avg_reward + +def train(cfg,env,agent): + rewards = [] + ma_rewards = [] # moveing average reward + for i_ep in range(int(cfg.train_eps)): + ep_reward = 0 + ep_timesteps = 0 + state, done = env.reset(), False + while not done: + ep_timesteps += 1 + # Select action randomly or according to policy + if i_ep < cfg.start_ep: + action = env.action_space.sample() + else: + action = ( + agent.choose_action(np.array(state)) + + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim) + ).clip(-max_action, max_action) + # Perform action + next_state, reward, done, _ = env.step(action) + done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0 + # Store data in replay buffer + agent.memory.push(state, action, next_state, reward, done_bool) + state = next_state + ep_reward += reward + # Train agent after collecting sufficient data + if i_ep+1 >= cfg.start_ep: + agent.update() + print(f"Episode:{i_ep+1}/{cfg.train_eps}, Step:{ep_timesteps}, Reward:{ep_reward:.3f}") + rewards.append(ep_reward) + # 计算滑动窗口的reward + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + return rewards, ma_rewards + + +if __name__ == "__main__": + cfg = TD3Config() + env = gym.make(cfg.env) + env.seed(cfg.seed) # Set seeds + torch.manual_seed(cfg.seed) + np.random.seed(cfg.seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + max_action = float(env.action_space.high[0]) + agent = TD3(state_dim,action_dim,max_action,cfg) + rewards,ma_rewards = train(cfg,env,agent) + make_dir(cfg.result_path,cfg.model_path) + agent.save(path=cfg.model_path) + save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) + plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + + diff --git a/codes/checkpoint.npy b/codes/checkpoint.npy deleted file mode 100644 index 591d49e..0000000 Binary files a/codes/checkpoint.npy and /dev/null differ diff --git a/codes/common/plot.py b/codes/common/plot.py index a4c3d62..8bf1689 100644 --- a/codes/common/plot.py +++ b/codes/common/plot.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-10-07 20:57:11 LastEditor: John -LastEditTime: 2021-04-08 21:45:09 +LastEditTime: 2021-04-28 10:13:21 Discription: Environment: ''' @@ -16,12 +16,21 @@ def plot_rewards(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN",s plt.title("average learning curve of {} for {}".format(algo,env)) plt.xlabel('epsiodes') plt.plot(rewards,label='rewards') - plt.plot(ma_rewards,label='moving average rewards') + plt.plot(ma_rewards,label='ma rewards') plt.legend() if save: plt.savefig(path+"rewards_curve_{}".format(tag)) plt.show() - +# def plot_rewards(dic,tag="train",env='CartPole-v0',algo = "DQN",save=True,path='./'): +# sns.set() +# plt.title("average learning curve of {} for {}".format(algo,env)) +# plt.xlabel('epsiodes') +# for key, value in dic.items(): +# plt.plot(value,label=key) +# plt.legend() +# if save: +# plt.savefig(path+algo+"_rewards_curve_{}".format(tag)) +# plt.show() def 
plot_losses(losses,algo = "DQN",save=True,path='./'): sns.set() plt.title("loss curve of {}".format(algo))
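One last note on the curves that `plot_rewards` draws: every script in this diff updates `ma_rewards` with `0.9*ma_rewards[-1] + 0.1*ep_reward`, which is an exponential moving average rather than a true sliding window (despite the `# 计算滑动窗口的reward` comment). For reference, the recurring bookkeeping is equivalent to this small helper; the name `smooth` is made up for illustration:

```python
def smooth(rewards, alpha=0.1):
    """Exponential moving average matching the per-episode ma_rewards updates."""
    ma_rewards = []
    for r in rewards:
        if ma_rewards:
            ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * r)
        else:
            ma_rewards.append(r)
    return ma_rewards
```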