diff --git a/codes/PPO/agent.py b/codes/PPO/agent.py index 8e669f6..0a7edd9 100644 --- a/codes/PPO/agent.py +++ b/codes/PPO/agent.py @@ -18,6 +18,7 @@ from PPO.memory import PPOMemory class PPO: def __init__(self, state_dim, action_dim,cfg): self.gamma = cfg.gamma + self.continuous = cfg.continuous self.policy_clip = cfg.policy_clip self.n_epochs = cfg.n_epochs self.gae_lambda = cfg.gae_lambda @@ -29,13 +30,13 @@ class PPO: self.memory = PPOMemory(cfg.batch_size) self.loss = 0 - def choose_action(self, state,continuous=False): + def choose_action(self, state): state = torch.tensor([state], dtype=torch.float).to(self.device) dist = self.actor(state) value = self.critic(state) action = dist.sample() probs = torch.squeeze(dist.log_prob(action)).item() - if continuous: + if self.continuous: action = torch.tanh(action) else: action = torch.squeeze(action).item() diff --git a/codes/PPO/task0.py b/codes/PPO/task0.py new file mode 100644 index 0000000..cd55eda --- /dev/null +++ b/codes/PPO/task0.py @@ -0,0 +1,67 @@ +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.plot import plot_rewards +from common.utils import save_results,make_dir +from PPO.agent import PPO +from PPO.train import train + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + +class PPOConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.continuous = False # 环境是否为连续动作 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 20 # 测试的回合数 + self.batch_size = 5 + self.gamma=0.99 + self.n_epochs = 4 + self.actor_lr = 0.0003 + self.critic_lr = 0.0003 + self.gae_lambda=0.95 + self.policy_clip=0.2 + self.hidden_dim = 256 + self.update_fre = 20 # frequency of agent update + +class PlotConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PPO(state_dim,action_dim,cfg) + return env,agent + +cfg = PPOConfig() +plot_cfg = PlotConfig() +# 训练 +env,agent = env_agent_config(cfg,seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) +save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") +# 测试 +env,agent = env_agent_config(cfg,seed=10) +agent.load(path=plot_cfg.model_path) +rewards,ma_rewards = eval(cfg,env,agent) +save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) +plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") \ No newline at end of file diff --git a/codes/PPO/task1.py b/codes/PPO/task1.py new file mode 100644 index 0000000..178efba --- /dev/null +++ b/codes/PPO/task1.py @@ -0,0 +1,68 @@ +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 
+parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.plot import plot_rewards +from common.utils import save_results,make_dir +from PPO.agent import PPO +from PPO.train import train + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + +class PPOConfig: + def __init__(self) -> None: + self.algo = "PPO" # 算法名称 + self.env_name = 'Pendulum-v1' # 环境名称 + self.continuous = True # 环境是否为连续动作 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 20 # 测试的回合数 + self.batch_size = 5 + self.gamma=0.99 + self.n_epochs = 4 + self.actor_lr = 0.0003 + self.critic_lr = 0.0003 + self.gae_lambda=0.95 + self.policy_clip=0.2 + self.hidden_dim = 256 + self.update_fre = 20 # frequency of agent update + +class PlotConfig: + def __init__(self) -> None: + self.algo = "PPO" # 算法名称 + self.env_name = 'Pendulum-v1' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = PPO(state_dim,action_dim,cfg) + return env,agent + + +cfg = PPOConfig() +plot_cfg = PlotConfig() +# 训练 +env,agent = env_agent_config(cfg,seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) +save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") +# 测试 +env,agent = env_agent_config(cfg,seed=10) +agent.load(path=plot_cfg.model_path) +rewards,ma_rewards = eval(cfg,env,agent) +save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) +plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") \ No newline at end of file diff --git a/codes/PPO/task1_train.py b/codes/PPO/task1_train.py deleted file mode 100644 index ff2a6b2..0000000 --- a/codes/PPO/task1_train.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2021-03-22 16:18:10 -LastEditor: John -LastEditTime: 2021-09-26 22:05:00 -Discription: -Environment: -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime -from PPO.agent import PPO -from common.plot import plot_rewards -from common.utils import save_results,make_dir - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time - -class PPOConfig: - def __init__(self) -> None: - self.algo = "PPO" # 算法名称 - self.env_name = 'Pendulum-v1' # 环境名称 - self.continuous = True # 环境是否为连续动作 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 200 # 训练的回合数 - self.eval_eps = 20 # 测试的回合数 - self.batch_size = 5 - self.gamma=0.99 - self.n_epochs = 4 - self.actor_lr = 0.0003 - self.critic_lr = 0.0003 - self.gae_lambda=0.95 - self.policy_clip=0.2 - self.hidden_dim = 256 - self.update_fre = 20 # frequency of agent update - -class 
PlotConfig: - def __init__(self) -> None: - self.algo = "PPO" # 算法名称 - self.env_name = 'Pendulum-v1' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env_name) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.shape[0] - agent = PPO(state_dim,action_dim,cfg) - return env,agent - -def train(cfg,env,agent): - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - steps = 0 - for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - action, prob, val = agent.choose_action(state,continuous=cfg.continuous) - print(action) - state_, reward, done, _ = env.step(action) - steps += 1 - ep_reward += reward - agent.memory.push(state, action, prob, val, reward, done) - if steps % cfg.update_fre == 0: - agent.update() - state = state_ - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - if (i_ep+1)%10 == 0: - print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}") - print('完成训练!') - return rewards,ma_rewards - -def eval(cfg,env,agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.eval_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - action, prob, val = agent.choose_action(state,continuous=False) - state_, reward, done, _ = env.step(action) - ep_reward += reward - state = state_ - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.eval_eps, ep_reward)) - print('完成训练!') - return rewards,ma_rewards - -if __name__ == '__main__': - cfg = PPOConfig() - plot_cfg = PlotConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=plot_cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=plot_cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) - plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") diff --git a/codes/PPO/task0_train.ipynb b/codes/PPO/train.ipynb similarity index 100% rename from codes/PPO/task0_train.ipynb rename to codes/PPO/train.ipynb diff --git a/codes/PPO/task0_train.py b/codes/PPO/train.py similarity index 52% rename from codes/PPO/task0_train.py rename to codes/PPO/train.py index e1354c6..aff54bf 100644 --- a/codes/PPO/task0_train.py +++ b/codes/PPO/train.py @@ -1,65 +1,3 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2021-03-22 16:18:10 -LastEditor: John -LastEditTime: 2021-09-26 22:05:00 -Discription: -Environment: -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 
-parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime -from PPO.agent import PPO -from common.plot import plot_rewards -from common.utils import save_results,make_dir - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time - -class PPOConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.continuous = False # 环境是否为连续动作 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 200 # 训练的回合数 - self.eval_eps = 20 # 测试的回合数 - self.batch_size = 5 - self.gamma=0.99 - self.n_epochs = 4 - self.actor_lr = 0.0003 - self.critic_lr = 0.0003 - self.gae_lambda=0.95 - self.policy_clip=0.2 - self.hidden_dim = 256 - self.update_fre = 20 # frequency of agent update - -class PlotConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env_name) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = PPO(state_dim,action_dim,cfg) - return env,agent - def train(cfg,env,agent): print('开始训练!') print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') @@ -71,7 +9,7 @@ def train(cfg,env,agent): done = False ep_reward = 0 while not done: - action, prob, val = agent.choose_action(state,continuous=cfg.continuous) + action, prob, val = agent.choose_action(state) state_, reward, done, _ = env.step(action) steps += 1 ep_reward += reward @@ -99,7 +37,7 @@ def eval(cfg,env,agent): done = False ep_reward = 0 while not done: - action, prob, val = agent.choose_action(state,cfg.continuous) + action, prob, val = agent.choose_action(state) state_, reward, done, _ = env.step(action) ep_reward += reward state = state_ @@ -112,8 +50,60 @@ def eval(cfg,env,agent): print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.eval_eps, ep_reward)) print('完成训练!') return rewards,ma_rewards - + if __name__ == '__main__': + import sys,os + curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 + parent_path = os.path.dirname(curr_path) # 父路径 + sys.path.append(parent_path) # 添加路径到系统路径 + + import gym + import torch + import datetime + from common.plot import plot_rewards + from common.utils import save_results,make_dir + from PPO.agent import PPO + from PPO.train import train + + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + + class PPOConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.continuous = False # 环境是否为连续动作 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 20 # 测试的回合数 + self.batch_size = 5 + self.gamma=0.99 + self.n_epochs = 4 + self.actor_lr = 0.0003 + self.critic_lr = 0.0003 + self.gae_lambda=0.95 + self.policy_clip=0.2 + self.hidden_dim = 256 + self.update_fre = 20 # frequency of agent update + + class PlotConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + 
self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PPO(state_dim,action_dim,cfg) + return env,agent + cfg = PPOConfig() plot_cfg = PlotConfig() # 训练 @@ -128,4 +118,4 @@ if __name__ == '__main__': agent.load(path=plot_cfg.model_path) rewards,ma_rewards = eval(cfg,env,agent) save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) - plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") + plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") \ No newline at end of file diff --git a/codes/README.md b/codes/README.md index 2c421ae..fdee344 100644 --- a/codes/README.md +++ b/codes/README.md @@ -1,6 +1,3 @@ - -[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md) - ## 写在前面 本项目用于学习RL基础算法,尽量做到: **注释详细**,**结构清晰**。 @@ -12,7 +9,7 @@ * ```plot.py``` 利用matplotlib或seaborn绘制rewards图,包括滑动平均的reward,结果保存在result文件夹中 * ```env.py``` 用于构建强化学习环境,也可以重新自定义环境,比如给action加noise * ```agent.py``` RL核心算法,比如dqn等,主要包含update和choose_action两个方法, -* ```main.py``` 运行主函数 +* ```train.py``` 保存用于训练和测试的函数 其中```model.py```,```memory.py```,```plot.py``` 由于不同算法都会用到,所以放入```common```文件夹中。 @@ -22,8 +19,8 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 ## 使用说明 -运行带有```train```的py文件或ipynb文件进行训练,如果前面带有```task```如```task0_train.py```,表示对task0任务训练, -类似的带有```eval```即为测试。 +直接运行带有```train```的py文件或ipynb文件会进行训练默认的任务; +也可以运行带有```task```的py文件训练不同的任务 ## 内容导航 diff --git a/codes/SAC/task0_train.py b/codes/SAC/task0_train.py index 4bc7221..625f1d7 100644 --- a/codes/SAC/task0_train.py +++ b/codes/SAC/task0_train.py @@ -10,10 +10,9 @@ Discription: Environment: ''' import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 import gym import torch @@ -24,7 +23,7 @@ from SAC.agent import SAC from common.utils import save_results, make_dir from common.plot import plot_rewards -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 class SACConfig: def __init__(self) -> None: @@ -48,6 +47,14 @@ class SACConfig: self.hidden_dim = 256 self.batch_size = 128 self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu") +class PlotConfig(SACConfig): + def __init__(self) -> None: + super().__init__() + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 def env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env_name)) @@ -58,13 +65,13 @@ def env_agent_config(cfg,seed=1): return env,agent def train(cfg,env,agent): - print('Start to train !') - print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward + print('开始训练!') + print(f'环境:{cfg.env_name}, 
算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): - state = env.reset() - ep_reward = 0 + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 for i_step in range(cfg.train_steps): action = agent.policy_net.get_action(state) next_state, reward, done, _ = env.step(action) @@ -111,21 +118,20 @@ def eval(cfg,env,agent): if __name__ == "__main__": cfg=SACConfig() - + plot_cfg = PlotConfig() # train env,agent = env_agent_config(cfg,seed=1) rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) + make_dir(plot_cfg.result_path, plot_cfg.model_path) + agent.save(path=plot_cfg.model_path) + save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) + plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # eval env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) + agent.load(path=plot_cfg.model_path) rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) + plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_actor similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_actor diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_actor_optimizer diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_critic similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_critic diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/models/td3_critic_optimizer diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/ma_rewards_train.npy diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png rename to 
codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/rewards_curve_train.png diff --git a/codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_train.npy b/codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/rewards_train.npy similarity index 100% rename from codes/TD3/results/HalfCheetah-v2/20210416-130341/results/rewards_train.npy rename to codes/TD3/outputs/HalfCheetah-v2/20210416-130341/results/rewards_train.npy diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor new file mode 100644 index 0000000..40533d9 Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor_optimizer b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor_optimizer new file mode 100644 index 0000000..e91a68f Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_actor_optimizer differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic new file mode 100644 index 0000000..ef6b3e5 Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic_optimizer b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic_optimizer new file mode 100644 index 0000000..8094beb Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/models/td3_critic_optimizer differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_ma_rewards.npy b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_ma_rewards.npy new file mode 100644 index 0000000..288eb69 Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_ma_rewards.npy differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards.npy b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards.npy new file mode 100644 index 0000000..5bdee4a Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards.npy differ diff --git a/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards_curve.png b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards_curve.png new file mode 100644 index 0000000..31e873c Binary files /dev/null and b/codes/TD3/outputs/Pendulum-v1/20211119-123814/results/train_rewards_curve.png differ diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/ma_rewards_train.npy b/codes/TD3/outputs/Reacher-v2/20210415-021952/ma_rewards_train.npy similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/ma_rewards_train.npy rename to codes/TD3/outputs/Reacher-v2/20210415-021952/ma_rewards_train.npy diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/rewards_curve_train.png b/codes/TD3/outputs/Reacher-v2/20210415-021952/rewards_curve_train.png similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/rewards_curve_train.png rename to codes/TD3/outputs/Reacher-v2/20210415-021952/rewards_curve_train.png diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/rewards_train.npy b/codes/TD3/outputs/Reacher-v2/20210415-021952/rewards_train.npy similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/rewards_train.npy rename to 
codes/TD3/outputs/Reacher-v2/20210415-021952/rewards_train.npy diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/td3_actor b/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_actor similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/td3_actor rename to codes/TD3/outputs/Reacher-v2/20210415-021952/td3_actor diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/td3_actor_optimizer b/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_actor_optimizer similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/td3_actor_optimizer rename to codes/TD3/outputs/Reacher-v2/20210415-021952/td3_actor_optimizer diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/td3_critic b/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_critic similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/td3_critic rename to codes/TD3/outputs/Reacher-v2/20210415-021952/td3_critic diff --git a/codes/TD3/results/Reacher-v2/20210415-021952/td3_critic_optimizer b/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_critic_optimizer similarity index 100% rename from codes/TD3/results/Reacher-v2/20210415-021952/td3_critic_optimizer rename to codes/TD3/outputs/Reacher-v2/20210415-021952/td3_critic_optimizer diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy deleted file mode 100644 index 9680e2a..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/ma_rewards_train.npy and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png deleted file mode 100644 index a54dced..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_curve_train.png and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy b/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy deleted file mode 100644 index d9f349d..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/rewards_train.npy and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor deleted file mode 100644 index b154cc1..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer deleted file mode 100644 index 28504b0..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor_optimizer and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic deleted file mode 100644 index 4bdff3f..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic and /dev/null differ diff --git a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer b/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer deleted file mode 100644 index 3fffdf3..0000000 Binary files a/codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic_optimizer and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor deleted file mode 100644 index ae4dfb2..0000000 
Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer deleted file mode 100644 index 42755e1..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor_optimizer and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic deleted file mode 100644 index b6181a0..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer b/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer deleted file mode 100644 index b2ad60f..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic_optimizer and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png b/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png deleted file mode 100644 index ee4d1af..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/TD3_rewards_curve_eval.png and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy deleted file mode 100644 index c2509fb..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_eval.npy and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy deleted file mode 100644 index 6181f14..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/ma_rewards_train.npy and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png deleted file mode 100644 index 14a7455..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_curve_train.png and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy deleted file mode 100644 index 80fd298..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_eval.npy and /dev/null differ diff --git a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy b/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy deleted file mode 100644 index eadc498..0000000 Binary files a/codes/TD3/results/Pendulum-v0/20210428-092059/results/rewards_train.npy and /dev/null differ diff --git a/codes/TD3/task1_train.py b/codes/TD3/task1_train.py index 5c0d533..9780f76 100644 --- a/codes/TD3/task1_train.py +++ b/codes/TD3/task1_train.py @@ -1,41 +1,47 @@ import sys,os -curr_path = os.path.dirname(__file__) -parent_path=os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 import torch import gym import numpy as np import datetime - from TD3.agent 
import TD3 from common.plot import plot_rewards from common.utils import save_results,make_dir -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 class TD3Config: def __init__(self) -> None: - self.algo = 'TD3' - self.env = 'Pendulum-v0' - self.seed = 0 - self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/' # path to save models + self.algo = 'TD3' # 算法名称 + self.env_name = 'Pendulum-v1' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 600 # 训练的回合数 self.start_timestep = 25e3 # Time steps initial random policy is used - self.start_ep = 50 # Episodes initial random policy is used + self.epsilon_start = 50 # Episodes initial random policy is used self.eval_freq = 10 # How often (episodes) we evaluate - self.train_eps = 600 self.max_timestep = 100000 # Max time steps to run environment self.expl_noise = 0.1 # Std of Gaussian exploration noise self.batch_size = 256 # Batch size for both actor and critic self.gamma = 0.9 # gamma factor - self.lr = 0.0005 # Target network update rate + self.lr = 0.0005 # 学习率 self.policy_noise = 0.2 # Noise added to target policy during critic update self.noise_clip = 0.3 # Range to clip target policy noise self.policy_freq = 2 # Frequency of delayed policy updates - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +class PlotConfig(TD3Config): + def __init__(self) -> None: + super().__init__() + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + # Runs policy for X episodes and returns average reward # A fixed seed is used for the eval environment @@ -57,8 +63,10 @@ def eval(env,agent, seed, eval_episodes=10): return avg_reward def train(cfg,env,agent): - rewards = [] - ma_rewards = [] # moveing average reward + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(int(cfg.train_eps)): ep_reward = 0 ep_timesteps = 0 @@ -66,7 +74,7 @@ def train(cfg,env,agent): while not done: ep_timesteps += 1 # Select action randomly or according to policy - if i_ep < cfg.start_ep: + if i_ep < cfg.epsilon_start: action = env.action_space.sample() else: action = ( @@ -81,32 +89,34 @@ def train(cfg,env,agent): state = next_state ep_reward += reward # Train agent after collecting sufficient data - if i_ep+1 >= cfg.start_ep: + if i_ep+1 >= cfg.epsilon_start: agent.update() - print(f"Episode:{i_ep+1}/{cfg.train_eps}, Step:{ep_timesteps}, Reward:{ep_reward:.3f}") + if (i_ep+1)%10 == 0: + print('回合:{}/{}, 奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) - # 计算滑动窗口的reward if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: - ma_rewards.append(ep_reward) + ma_rewards.append(ep_reward) + print('完成训练!') return rewards, ma_rewards if __name__ == "__main__": cfg = TD3Config() - env = gym.make(cfg.env) - env.seed(cfg.seed) # Set seeds - torch.manual_seed(cfg.seed) - np.random.seed(cfg.seed) + plot_cfg = PlotConfig() + env = gym.make(cfg.env_name) + env.seed(1) # 随机种子 + torch.manual_seed(1) + np.random.seed(1) state_dim = env.observation_space.shape[0] 
action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) agent = TD3(state_dim,action_dim,max_action,cfg) rewards,ma_rewards = train(cfg,env,agent) - make_dir(cfg.result_path,cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + make_dir(plot_cfg.result_path,plot_cfg.model_path) + agent.save(path=plot_cfg.model_path) + save_results(rewards,ma_rewards,tag='train',path=plot_cfg.result_path) + plot_rewards(rewards,ma_rewards,plot_cfg,tag="train")
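
Note on the agent.py hunk at the top of this patch: choose_action() now reads the continuous flag from the config object instead of taking it as a per-call argument, so task0.py (CartPole-v0, discrete actions) and task1.py (Pendulum-v1, continuous actions) differ only in their PPOConfig, and train()/eval() in PPO/train.py can call agent.choose_action(state) uniformly. Below is a minimal, self-contained sketch of that pattern under stated assumptions: SimpleActor and DemoConfig are illustrative stand-ins, not classes from this repository, and the critic/value output of the real choose_action() is omitted for brevity.

import torch
import torch.nn as nn
from torch.distributions import Categorical, Normal

class DemoConfig:
    # illustrative stand-in for PPOConfig; not part of the repo
    continuous = False   # True for Pendulum-v1 (Box actions), False for CartPole-v0 (Discrete)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SimpleActor(nn.Module):
    """Toy actor that returns a torch distribution, as the PPO actor is assumed to do."""
    def __init__(self, state_dim, action_dim, continuous):
        super().__init__()
        self.continuous = continuous
        self.net = nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(),
                                 nn.Linear(64, action_dim))
        if continuous:
            self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        out = self.net(state)
        if self.continuous:
            return Normal(out, self.log_std.exp())   # Gaussian policy for Box spaces
        return Categorical(logits=out)               # softmax policy for Discrete spaces

def choose_action(actor, state, cfg):
    """Mirrors the refactored PPO.choose_action: the flag comes from cfg, not the caller.
    The critic/value part of the real method is left out here."""
    state = torch.tensor([state], dtype=torch.float).to(cfg.device)
    dist = actor(state)
    action = dist.sample()
    log_prob = torch.squeeze(dist.log_prob(action)).item()  # scalar log-prob (1-D action assumed)
    if cfg.continuous:
        action = torch.tanh(action)            # squash to [-1, 1] for Box action spaces
    else:
        action = torch.squeeze(action).item()  # integer index for Discrete action spaces
    return action, log_prob

# usage sketch with CartPole-like dimensions
cfg = DemoConfig()
actor = SimpleActor(state_dim=4, action_dim=2, continuous=cfg.continuous).to(cfg.device)
action, log_prob = choose_action(actor, [0.0, 0.0, 0.0, 0.0], cfg)
print(action, log_prob)

Moving the flag into the config keeps the training and evaluation loops agnostic to the action space, which is why the hunks in PPO/train.py above simply drop the extra argument.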