update codes

johnjim0816
2021-11-19 16:02:34 +08:00
parent 129c0c65fa
commit 64c319cab4
47 changed files with 262 additions and 255 deletions


@@ -18,6 +18,7 @@ from PPO.memory import PPOMemory
 class PPO:
     def __init__(self, state_dim, action_dim,cfg):
         self.gamma = cfg.gamma
+        self.continuous = cfg.continuous
         self.policy_clip = cfg.policy_clip
         self.n_epochs = cfg.n_epochs
         self.gae_lambda = cfg.gae_lambda
@@ -29,13 +30,13 @@ class PPO:
         self.memory = PPOMemory(cfg.batch_size)
         self.loss = 0
-    def choose_action(self, state,continuous=False):
+    def choose_action(self, state):
         state = torch.tensor([state], dtype=torch.float).to(self.device)
         dist = self.actor(state)
         value = self.critic(state)
         action = dist.sample()
         probs = torch.squeeze(dist.log_prob(action)).item()
-        if continuous:
+        if self.continuous:
             action = torch.tanh(action)
         else:
             action = torch.squeeze(action).item()
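The new `self.continuous` flag only changes how the sampled action is post-processed; the actor must also emit a matching distribution. The repository's `Actor` network is not shown in this diff, so the snippet below is only a minimal sketch of that idea (the class name, layer sizes and the state-independent log-std are assumptions, not the project's code):

```python
import torch
import torch.nn as nn
from torch.distributions import Categorical, Normal

class ActorSketch(nn.Module):
    """Illustrative actor head: Categorical for discrete actions, Normal for continuous ones."""
    def __init__(self, state_dim, action_dim, hidden_dim=256, continuous=False):
        super().__init__()
        self.continuous = continuous
        self.body = nn.Sequential(
            nn.Linear(state_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, action_dim),
        )
        if continuous:
            # learnable log standard deviation, one value per action dimension (assumption)
            self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        out = self.body(state)
        if self.continuous:
            return Normal(out, self.log_std.exp())  # mean/std, sampled with dist.sample()
        return Categorical(logits=out)              # discrete action indices
```

With either head, `dist.sample()` and `dist.log_prob(action)` in `choose_action` work unchanged; `torch.tanh` then squashes the continuous sample into [-1, 1].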

codes/PPO/task0.py (new file, 67 lines)

@@ -0,0 +1,67 @@
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add the parent path to sys.path
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import save_results, make_dir
from PPO.agent import PPO
from PPO.train import train, eval

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # current time, used to name output folders

class PPOConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'CartPole-v0'  # environment name
        self.continuous = False  # whether the action space is continuous
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.train_eps = 200  # number of training episodes
        self.eval_eps = 20  # number of evaluation episodes
        self.batch_size = 5
        self.gamma = 0.99
        self.n_epochs = 4
        self.actor_lr = 0.0003
        self.critic_lr = 0.0003
        self.gae_lambda = 0.95
        self.policy_clip = 0.2
        self.hidden_dim = 256
        self.update_fre = 20  # frequency of agent updates

class PlotConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'CartPole-v0'  # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path for saving results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures

def env_agent_config(cfg, seed=1):
    env = gym.make(cfg.env_name)
    env.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPO(state_dim, action_dim, cfg)
    return env, agent

cfg = PPOConfig()
plot_cfg = PlotConfig()
# training
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# evaluation
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path)
rewards, ma_rewards = eval(cfg, env, agent)
save_results(rewards, ma_rewards, tag='eval', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="eval")
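task0.py only sets the hyperparameters; the agent's `update()` is not part of this diff. As a reference for what `gae_lambda` and `policy_clip` conventionally control in PPO, here is an illustrative NumPy sketch of generalized advantage estimation and the clipped surrogate term (an assumption about the usual formulation, not the repository's implementation):

```python
import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation over one rollout (illustrative only)."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        # bootstrap with the next state's value; 0 past the end of the rollout (simplification)
        next_value = values[t + 1] if t + 1 < len(values) else 0.0
        mask = 1.0 - float(dones[t])  # stop bootstrapping at episode boundaries
        delta = rewards[t] + gamma * next_value * mask - values[t]
        last_adv = delta + gamma * gae_lambda * mask * last_adv
        advantages[t] = last_adv
    return advantages

def clipped_surrogate(ratio, advantage, policy_clip=0.2):
    """PPO clipped objective term: min(r*A, clip(r, 1-eps, 1+eps)*A), to be maximized."""
    return np.minimum(ratio * advantage,
                      np.clip(ratio, 1 - policy_clip, 1 + policy_clip) * advantage)
```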

codes/PPO/task1.py (new file, 68 lines)

@@ -0,0 +1,68 @@
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add the parent path to sys.path
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import save_results, make_dir
from PPO.agent import PPO
from PPO.train import train, eval

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # current time, used to name output folders

class PPOConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'Pendulum-v1'  # environment name
        self.continuous = True  # whether the action space is continuous
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.train_eps = 200  # number of training episodes
        self.eval_eps = 20  # number of evaluation episodes
        self.batch_size = 5
        self.gamma = 0.99
        self.n_epochs = 4
        self.actor_lr = 0.0003
        self.critic_lr = 0.0003
        self.gae_lambda = 0.95
        self.policy_clip = 0.2
        self.hidden_dim = 256
        self.update_fre = 20  # frequency of agent updates

class PlotConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'Pendulum-v1'  # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path for saving results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures

def env_agent_config(cfg, seed=1):
    env = gym.make(cfg.env_name)
    env.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPO(state_dim, action_dim, cfg)
    return env, agent

cfg = PPOConfig()
plot_cfg = PlotConfig()
# training
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# evaluation
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path)
rewards, ma_rewards = eval(cfg, env, agent)
save_results(rewards, ma_rewards, tag='eval', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="eval")
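One thing to watch with task1.py: `choose_action` squashes continuous actions with `torch.tanh`, so the agent acts in [-1, 1], while `Pendulum-v1` expects torques in [-2, 2]. If rescaling is needed, a thin wrapper along these lines can map the agent's range onto the environment's (`RescaledActionEnv` is an illustrative name; this is a sketch, not part of this commit):

```python
import gym
import numpy as np

class RescaledActionEnv(gym.ActionWrapper):
    """Hypothetical wrapper: maps an agent action in [-1, 1] onto the env's action bounds."""
    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        action = np.clip(action, -1.0, 1.0)
        return low + (action + 1.0) * 0.5 * (high - low)

# e.g. in env_agent_config: env = RescaledActionEnv(gym.make(cfg.env_name))
```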


@@ -1,132 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-22 16:18:10
LastEditor: John
LastEditTime: 2021-09-26 22:05:00
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add the parent path to sys.path
import gym
import torch
import datetime
from PPO.agent import PPO
from common.plot import plot_rewards
from common.utils import save_results,make_dir
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
class PPOConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'Pendulum-v1'  # environment name
        self.continuous = True  # whether the action space is continuous
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.train_eps = 200  # number of training episodes
        self.eval_eps = 20  # number of evaluation episodes
        self.batch_size = 5
        self.gamma = 0.99
        self.n_epochs = 4
        self.actor_lr = 0.0003
        self.critic_lr = 0.0003
        self.gae_lambda = 0.95
        self.policy_clip = 0.2
        self.hidden_dim = 256
        self.update_fre = 20  # frequency of agent update
class PlotConfig:
    def __init__(self) -> None:
        self.algo = "PPO"  # algorithm name
        self.env_name = 'Pendulum-v1'  # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
        self.result_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/results/'  # path for saving results
        self.model_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/models/'  # path for saving models
        self.save = True  # whether to save figures
def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)
    env.seed(seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPO(state_dim,action_dim,cfg)
    return env,agent
def train(cfg,env,agent):
    print('Start training!')
    print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record the moving-average rewards for all episodes
    steps = 0
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        done = False
        ep_reward = 0
        while not done:
            action, prob, val = agent.choose_action(state,continuous=cfg.continuous)
            print(action)
            state_, reward, done, _ = env.step(action)
            steps += 1
            ep_reward += reward
            agent.memory.push(state, action, prob, val, reward, done)
            if steps % cfg.update_fre == 0:
                agent.update()
            state = state_
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
        if (i_ep+1)%10 == 0:
            print(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}")
    print('Finish training!')
    return rewards,ma_rewards
def eval(cfg,env,agent):
    print('Start evaluation!')
    print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record the moving-average rewards for all episodes
    for i_ep in range(cfg.eval_eps):
        state = env.reset()
        done = False
        ep_reward = 0
        while not done:
            action, prob, val = agent.choose_action(state,continuous=False)
            state_, reward, done, _ = env.step(action)
            ep_reward += reward
            state = state_
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
        print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.eval_eps, ep_reward))
    print('Finish training!')
    return rewards,ma_rewards
if __name__ == '__main__':
    cfg = PPOConfig()
    plot_cfg = PlotConfig()
    # training
    env,agent = env_agent_config(cfg,seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
    agent.save(path=plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
    # evaluation
    env,agent = env_agent_config(cfg,seed=10)
    agent.load(path=plot_cfg.model_path)
    rewards,ma_rewards = eval(cfg,env,agent)
    save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
    plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
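Both the removed script above and the new files smooth episode returns the same way: `0.9*ma_rewards[-1] + 0.1*ep_reward` is an exponential moving average. Written as a standalone helper (purely illustrative; the scripts keep it inline):

```python
def smooth(rewards, alpha=0.9):
    """Exponential moving average of episode rewards: ma_t = alpha*ma_{t-1} + (1-alpha)*r_t."""
    ma = []
    for r in rewards:
        ma.append(alpha * ma[-1] + (1 - alpha) * r if ma else r)
    return ma
```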


@@ -1,65 +1,3 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-22 16:18:10
-LastEditor: John
-LastEditTime: 2021-09-26 22:05:00
-Discription:
-Environment:
-'''
-import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
-parent_path = os.path.dirname(curr_path)  # parent path
-sys.path.append(parent_path)  # add the parent path to sys.path
-import gym
-import torch
-import datetime
-from PPO.agent import PPO
-from common.plot import plot_rewards
-from common.utils import save_results,make_dir
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
-class PPOConfig:
-    def __init__(self) -> None:
-        self.algo = "DQN"  # algorithm name
-        self.env_name = 'CartPole-v0'  # environment name
-        self.continuous = False  # whether the action space is continuous
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
-        self.train_eps = 200  # number of training episodes
-        self.eval_eps = 20  # number of evaluation episodes
-        self.batch_size = 5
-        self.gamma = 0.99
-        self.n_epochs = 4
-        self.actor_lr = 0.0003
-        self.critic_lr = 0.0003
-        self.gae_lambda = 0.95
-        self.policy_clip = 0.2
-        self.hidden_dim = 256
-        self.update_fre = 20  # frequency of agent update
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo = "DQN"  # algorithm name
-        self.env_name = 'CartPole-v0'  # environment name
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
-        self.result_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/results/'  # path for saving results
-        self.model_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/models/'  # path for saving models
-        self.save = True  # whether to save figures
-def env_agent_config(cfg,seed=1):
-    env = gym.make(cfg.env_name)
-    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = PPO(state_dim,action_dim,cfg)
-    return env,agent
 def train(cfg,env,agent):
     print('Start training!')
     print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
@@ -71,7 +9,7 @@ def train(cfg,env,agent):
         done = False
         ep_reward = 0
         while not done:
-            action, prob, val = agent.choose_action(state,continuous=cfg.continuous)
+            action, prob, val = agent.choose_action(state)
             state_, reward, done, _ = env.step(action)
             steps += 1
             ep_reward += reward
@@ -99,7 +37,7 @@ def eval(cfg,env,agent):
         done = False
         ep_reward = 0
         while not done:
-            action, prob, val = agent.choose_action(state,cfg.continuous)
+            action, prob, val = agent.choose_action(state)
             state_, reward, done, _ = env.step(action)
             ep_reward += reward
             state = state_
@@ -112,8 +50,60 @@ def eval(cfg,env,agent):
         print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.eval_eps, ep_reward))
     print('Finish training!')
     return rewards,ma_rewards
 if __name__ == '__main__':
+    import sys,os
+    curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
+    parent_path = os.path.dirname(curr_path)  # parent path
+    sys.path.append(parent_path)  # add the parent path to sys.path
+    import gym
+    import torch
+    import datetime
+    from common.plot import plot_rewards
+    from common.utils import save_results,make_dir
+    from PPO.agent import PPO
+    from PPO.train import train
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
+    class PPOConfig:
+        def __init__(self) -> None:
+            self.algo = "DQN"  # algorithm name
+            self.env_name = 'CartPole-v0'  # environment name
+            self.continuous = False  # whether the action space is continuous
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
+            self.train_eps = 200  # number of training episodes
+            self.eval_eps = 20  # number of evaluation episodes
+            self.batch_size = 5
+            self.gamma = 0.99
+            self.n_epochs = 4
+            self.actor_lr = 0.0003
+            self.critic_lr = 0.0003
+            self.gae_lambda = 0.95
+            self.policy_clip = 0.2
+            self.hidden_dim = 256
+            self.update_fre = 20  # frequency of agent update
+    class PlotConfig:
+        def __init__(self) -> None:
+            self.algo = "DQN"  # algorithm name
+            self.env_name = 'CartPole-v0'  # environment name
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
+            self.result_path = curr_path+"/outputs/" + self.env_name + \
+                '/'+curr_time+'/results/'  # path for saving results
+            self.model_path = curr_path+"/outputs/" + self.env_name + \
+                '/'+curr_time+'/models/'  # path for saving models
+            self.save = True  # whether to save figures
+    def env_agent_config(cfg,seed=1):
+        env = gym.make(cfg.env_name)
+        env.seed(seed)
+        state_dim = env.observation_space.shape[0]
+        action_dim = env.action_space.n
+        agent = PPO(state_dim,action_dim,cfg)
+        return env,agent
     cfg = PPOConfig()
     plot_cfg = PlotConfig()
     # training
@@ -128,4 +118,4 @@ if __name__ == '__main__':
     agent.load(path=plot_cfg.model_path)
     rewards,ma_rewards = eval(cfg,env,agent)
     save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
     plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")


@@ -1,6 +1,3 @@
-[Eng](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README_en.md)|[中文](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/README.md)
 ## Foreword
 This project is for learning basic RL algorithms, aiming for **detailed comments** and **a clear structure**.
@@ -12,7 +9,7 @@
 * ```plot.py``` uses matplotlib or seaborn to plot the rewards curve (including the moving average); results are saved in the result folder
 * ```env.py``` builds the RL environment; you can also customize one, e.g. add noise to the action
 * ```agent.py``` the core RL algorithm (e.g. DQN), mainly the update and choose_action methods
-* ```main.py``` the main entry script
+* ```train.py``` the functions used for training and evaluation
 ```model.py```, ```memory.py``` and ```plot.py``` are used by several algorithms, so they live in the ```common``` folder.
@@ -22,8 +19,8 @@ python 3.7, pytorch 1.6.0-1.8.1, gym 0.17.0-0.19.0
 ## Usage
-Run the py or ipynb files whose names contain ```train``` to train; a leading ```task``` prefix such as ```task0_train.py``` means training on task0,
-and files containing ```eval``` run the evaluation.
+Run the py or ipynb files whose names contain ```train``` directly to train the default task;
+you can also run the py files whose names contain ```task``` to train different tasks.
 ## Contents
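To make the layout described above concrete: every algorithm folder exposes the same small surface, so a task script is just wiring. A sketch of that convention using the PPO names from this commit (hyperparameter details omitted; the real scripts are task0.py and task1.py above):

```python
import gym
from PPO.agent import PPO           # agent.py: choose_action() and update()
from PPO.train import train, eval   # train.py: the training and evaluation loops

def run(cfg):
    """Minimal task-script skeleton: build env and agent from a config, then train and evaluate."""
    env = gym.make(cfg.env_name)
    agent = PPO(env.observation_space.shape[0], env.action_space.n, cfg)
    train(cfg, env, agent)
    return eval(cfg, env, agent)
```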


@@ -10,10 +10,9 @@ Discription:
 Environment:
 '''
 import sys,os
-curr_path = os.path.dirname(__file__)
+curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
-parent_path = os.path.dirname(curr_path)
+parent_path = os.path.dirname(curr_path)  # parent path
-sys.path.append(parent_path)  # add current terminal path to sys.path
+sys.path.append(parent_path)  # add the parent path to sys.path
 import gym
 import torch
@@ -24,7 +23,7 @@ from SAC.agent import SAC
 from common.utils import save_results, make_dir
 from common.plot import plot_rewards
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
 class SACConfig:
     def __init__(self) -> None:
@@ -48,6 +47,14 @@ class SACConfig:
         self.hidden_dim = 256
         self.batch_size = 128
         self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class PlotConfig(SACConfig):
+    def __init__(self) -> None:
+        super().__init__()
+        self.result_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/results/'  # path for saving results
+        self.model_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/models/'  # path for saving models
+        self.save = True  # whether to save figures
 def env_agent_config(cfg,seed=1):
     env = NormalizedActions(gym.make(cfg.env_name))
@@ -58,13 +65,13 @@ def env_agent_config(cfg,seed=1):
     return env,agent
 def train(cfg,env,agent):
-    print('Start to train !')
+    print('Start training!')
-    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
+    print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
-    rewards = []
+    rewards = []  # record rewards for all episodes
-    ma_rewards = []  # moving average reward
+    ma_rewards = []  # record the moving-average rewards for all episodes
     for i_ep in range(cfg.train_eps):
-        state = env.reset()
-        ep_reward = 0
+        ep_reward = 0  # reward within one episode
+        state = env.reset()  # reset the environment and return the initial state
         for i_step in range(cfg.train_steps):
             action = agent.policy_net.get_action(state)
             next_state, reward, done, _ = env.step(action)
@@ -111,21 +118,20 @@ def eval(cfg,env,agent):
 if __name__ == "__main__":
     cfg=SACConfig()
+    plot_cfg = PlotConfig()
     # train
     env,agent = env_agent_config(cfg,seed=1)
     rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path)
+    make_dir(plot_cfg.result_path, plot_cfg.model_path)
-    agent.save(path=cfg.model_path)
+    agent.save(path=plot_cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-    plot_rewards(rewards, ma_rewards, tag="train",
-                 algo=cfg.algo, path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
     # eval
     env,agent = env_agent_config(cfg,seed=10)
-    agent.load(path=cfg.model_path)
+    agent.load(path=plot_cfg.model_path)
     rewards,ma_rewards = eval(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
+    save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
-    plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
+    plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
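The call sites above switch from keyword arguments (`tag`, `algo`, `path`) to passing a whole `plot_cfg`. `common/plot.py` itself is not part of this diff; the following is only a sketch of a `plot_rewards(rewards, ma_rewards, plot_cfg, tag)` signature consistent with these calls, not the repository's implementation:

```python
import matplotlib.pyplot as plt

def plot_rewards_sketch(rewards, ma_rewards, plot_cfg, tag="train"):
    """Illustrative plotting helper matching the new call signature; assumes plot_cfg has
    algo, env_name, result_path and save, as the PlotConfig classes in this commit do."""
    plt.figure()
    plt.title(f"{tag} rewards of {plot_cfg.algo} on {plot_cfg.env_name}")
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='ma rewards')
    plt.xlabel('episodes')
    plt.legend()
    if plot_cfg.save:
        plt.savefig(plot_cfg.result_path + f"{tag}_rewards_curve.png")
    plt.show()
```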

(Binary image diffs not shown: one results figure added (67 KiB), one replaced (55 KiB), and three removed (51 KiB, 56 KiB, 70 KiB).)


@@ -1,41 +1,47 @@
 import sys,os
-curr_path = os.path.dirname(__file__)
+curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
-parent_path=os.path.dirname(curr_path)
+parent_path = os.path.dirname(curr_path)  # parent path
-sys.path.append(parent_path)  # add current terminal path to sys.path
+sys.path.append(parent_path)  # add the parent path to sys.path
 import torch
 import gym
 import numpy as np
 import datetime
 from TD3.agent import TD3
 from common.plot import plot_rewards
 from common.utils import save_results,make_dir
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
 class TD3Config:
     def __init__(self) -> None:
-        self.algo = 'TD3'
+        self.algo = 'TD3'  # algorithm name
-        self.env = 'Pendulum-v0'
+        self.env_name = 'Pendulum-v1'  # environment name
-        self.seed = 0
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available
-        self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/'  # path to save results
+        self.train_eps = 600  # number of training episodes
-        self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/'  # path to save models
         self.start_timestep = 25e3  # Time steps initial random policy is used
-        self.start_ep = 50  # Episodes initial random policy is used
+        self.epsilon_start = 50  # Episodes initial random policy is used
         self.eval_freq = 10  # How often (episodes) we evaluate
-        self.train_eps = 600
         self.max_timestep = 100000  # Max time steps to run environment
         self.expl_noise = 0.1  # Std of Gaussian exploration noise
         self.batch_size = 256  # Batch size for both actor and critic
         self.gamma = 0.9  # gamma factor
-        self.lr = 0.0005  # Target network update rate
+        self.lr = 0.0005  # learning rate
         self.policy_noise = 0.2  # Noise added to target policy during critic update
         self.noise_clip = 0.3  # Range to clip target policy noise
         self.policy_freq = 2  # Frequency of delayed policy updates
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class PlotConfig(TD3Config):
+    def __init__(self) -> None:
+        super().__init__()
+        self.result_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/results/'  # path for saving results
+        self.model_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/models/'  # path for saving models
+        self.save = True  # whether to save figures
 # Runs policy for X episodes and returns average reward
 # A fixed seed is used for the eval environment
@@ -57,8 +63,10 @@ def eval(env,agent, seed, eval_episodes=10):
     return avg_reward
 def train(cfg,env,agent):
-    rewards = []
+    print('Start training!')
-    ma_rewards = []  # moving average reward
+    print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
+    rewards = []  # record rewards for all episodes
+    ma_rewards = []  # record the moving-average rewards for all episodes
     for i_ep in range(int(cfg.train_eps)):
         ep_reward = 0
         ep_timesteps = 0
@@ -66,7 +74,7 @@ def train(cfg,env,agent):
         while not done:
             ep_timesteps += 1
             # Select action randomly or according to policy
-            if i_ep < cfg.start_ep:
+            if i_ep < cfg.epsilon_start:
                 action = env.action_space.sample()
             else:
                 action = (
@@ -81,32 +89,34 @@ def train(cfg,env,agent):
             state = next_state
             ep_reward += reward
             # Train agent after collecting sufficient data
-            if i_ep+1 >= cfg.start_ep:
+            if i_ep+1 >= cfg.epsilon_start:
                 agent.update()
-        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Step:{ep_timesteps}, Reward:{ep_reward:.3f}")
+        if (i_ep+1)%10 == 0:
+            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
         rewards.append(ep_reward)
+        # compute the moving-average reward
         if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
            ma_rewards.append(ep_reward)
+    print('Finish training!')
     return rewards, ma_rewards
 if __name__ == "__main__":
     cfg = TD3Config()
-    env = gym.make(cfg.env)
+    plot_cfg = PlotConfig()
-    env.seed(cfg.seed)  # Set seeds
+    env = gym.make(cfg.env_name)
-    torch.manual_seed(cfg.seed)
+    env.seed(1)  # random seed
-    np.random.seed(cfg.seed)
+    torch.manual_seed(1)
+    np.random.seed(1)
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
     agent = TD3(state_dim,action_dim,max_action,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
-    make_dir(cfg.result_path,cfg.model_path)
+    make_dir(plot_cfg.result_path,plot_cfg.model_path)
-    agent.save(path=cfg.model_path)
+    agent.save(path=plot_cfg.model_path)
-    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
+    save_results(rewards,ma_rewards,tag='train',path=plot_cfg.result_path)
-    plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
+    plot_rewards(rewards,ma_rewards,plot_cfg,tag="train")
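The `action = (` expression in the TD3 training loop is truncated in this view and is left as-is above. For orientation only, a typical TD3 exploration step has this shape; the method name `agent.choose_action` and the variable names are assumptions for illustration, not necessarily the repository's exact code:

```python
import numpy as np

# Typical TD3 exploration: deterministic policy action plus Gaussian noise, clipped to the action bounds.
action = (
    agent.choose_action(np.array(state))
    + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
).clip(-max_action, max_action)
```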