update
@@ -92,14 +92,10 @@ class TD3(object):
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
        self.memory = ReplayBuffer(state_dim, action_dim)

    def choose_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()

    def update(self):
        self.total_it += 1
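The body of update() is truncated in this hunk. For orientation only, below is a minimal sketch of the clipped double-Q target and delayed policy update that a TD3 update step typically performs, modelled on the standard TD3 reference implementation. The attribute names (self.tau, self.max_action, self.memory.sample, self.critic.Q1, the (q1, q2) return of the critic) are assumptions and may not match this repo's agent.py exactly.

# Sketch only: assumes self.actor/self.critic plus *_target copies and TD3 hyperparameters on the agent.
import torch
import torch.nn.functional as F

def update(self):
    self.total_it += 1
    state, action, next_state, reward, not_done = self.memory.sample(self.batch_size)

    with torch.no_grad():
        # Target policy smoothing: add clipped noise to the target action
        noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
        next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
        # Clipped double-Q learning: bootstrap from the minimum of the two target critics
        target_q1, target_q2 = self.critic_target(next_state, next_action)
        target_q = reward + not_done * self.gamma * torch.min(target_q1, target_q2)

    # Update both critic heads towards the shared target
    current_q1, current_q2 = self.critic(state, action)
    critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()

    # Delayed policy update and soft target updates every policy_freq steps
    if self.total_it % self.policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)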
@@ -167,4 +163,4 @@ class TD3(object):
        self.actor.load_state_dict(torch.load(path + "td3_actor"))
        self.actor_optimizer.load_state_dict(torch.load(path + "td3_actor_optimizer"))
        self.actor_target = copy.deepcopy(self.actor)
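The hunk above shows only the actor half of load(). For reference, a minimal sketch of the matching save()/load() pair that would produce and consume the td3_actor and td3_critic files checked in under results/.../models/; the critic optimizer file name is an assumption mirroring the actor one.

# Sketch only: file names follow the "td3_actor*" / "td3_critic*" pattern seen in this commit.
import copy
import torch

def save(self, path):
    torch.save(self.critic.state_dict(), path + "td3_critic")
    torch.save(self.critic_optimizer.state_dict(), path + "td3_critic_optimizer")
    torch.save(self.actor.state_dict(), path + "td3_actor")
    torch.save(self.actor_optimizer.state_dict(), path + "td3_actor_optimizer")

def load(self, path):
    self.critic.load_state_dict(torch.load(path + "td3_critic"))
    self.critic_optimizer.load_state_dict(torch.load(path + "td3_critic_optimizer"))
    self.critic_target = copy.deepcopy(self.critic)
    self.actor.load_state_dict(torch.load(path + "td3_actor"))
    self.actor_optimizer.load_state_dict(torch.load(path + "td3_actor_optimizer"))
    self.actor_target = copy.deepcopy(self.actor)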
[Binary files changed in this commit, contents not shown: result plot images (one updated 42 KiB -> 44 KiB; new images of 51 KiB, 56 KiB and 70 KiB) and saved model weights:]
codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_actor (new)
codes/TD3/results/HalfCheetah-v2/20210421-004751/td3_critic (new)
codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_actor (new)
codes/TD3/results/Pendulum-v0/20210428-092059/models/td3_critic (new)
codes/TD3/task0_eval.py (new file, 89 lines)
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-23 20:36:23
LastEditor: JiangJi
LastEditTime: 2021-04-23 20:37:22
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add current terminal path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time

class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3 and Random'
        self.env = 'HalfCheetah-v2'
        self.seed = 0
        self.result_path = curr_path+"/results/"+self.env+'/'+curr_time+'/results/'  # path to save results
        self.model_path = curr_path+"/results/"+self.env+'/'+curr_time+'/models/'  # path to save models
        self.start_timestep = 25e3  # time steps during which the initial random policy is used
        self.eval_freq = 5e3  # how often (time steps) we evaluate
        self.max_timestep = 200000  # max time steps to run environment
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor (gamma)
        self.lr = 0.0005  # Target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.5  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Runs the policy for X episodes and returns the per-episode rewards
# A fixed seed is used for the eval environment
def eval(env_name, agent, seed, eval_episodes=50):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    rewards, ma_rewards = [], []
    for i_episode in range(eval_episodes):
        ep_reward = 0
        state, done = eval_env.reset(), False
        while not done:
            eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            ep_reward += reward
        print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}")
        rewards.append(ep_reward)
        # compute the sliding-window (moving average) reward
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards

if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards({'td3_rewards': td3_rewards, 'td3_ma_rewards': td3_ma_rewards}, tag="eval", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env, agent, cfg.seed)

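The evaluation script relies on make_dir and save_results from common.utils, which are outside this diff. Below is a minimal sketch of helpers compatible with how they are called here (create the output directories, then dump the reward lists with numpy); the repo's actual common.utils may be implemented differently.

# Sketch only: hypothetical stand-ins for common.utils.make_dir / save_results,
# written to match the call sites in task0_eval.py.
import os
import numpy as np

def make_dir(*paths):
    # Create each result/model directory if it does not already exist
    for path in paths:
        os.makedirs(path, exist_ok=True)

def save_results(rewards, ma_rewards, tag='train', path='./results/'):
    # Persist the raw and moving-average rewards as .npy files under `path`
    np.save(os.path.join(path, f'{tag}_rewards.npy'), rewards)
    np.save(os.path.join(path, f'{tag}_ma_rewards.npy'), ma_rewards)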
@@ -21,11 +21,12 @@ class TD3Config:
        self.algo = 'TD3'
        self.env = 'HalfCheetah-v2'
        self.seed = 0
        self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/'  # path to save results
        self.result_path = curr_path+"/results/" +self.env+'/'+curr_time+'/results/'  # path to save results
        self.model_path = curr_path+"/results/" +self.env+'/'+curr_time+'/models/'  # path to save models
        self.start_timestep = 25e3  # Time steps initial random policy is used
        self.eval_freq = 5e3  # How often (time steps) we evaluate
        # self.train_eps = 800
        self.max_timestep = 1600000  # Max time steps to run environment
        self.max_timestep = 4000000  # Max time steps to run environment
        self.expl_noise = 0.1  # Std of Gaussian exploration noise
        self.batch_size = 256  # Batch size for both actor and critic
        self.gamma = 0.99  # gamma factor
@@ -161,9 +162,12 @@ if __name__ == "__main__":
    max_action = float(env.action_space.high[0])
    agent = TD3(state_dim,action_dim,max_action,cfg)
    rewards,ma_rewards = train(cfg,env,agent)
    make_dir(cfg.result_path)
    agent.save(path=cfg.result_path)
    make_dir(cfg.result_path,cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
    plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env,agent, cfg.seed)

codes/TD3/task1_eval.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-23 20:36:23
LastEditor: JiangJi
LastEditTime: 2021-04-28 10:14:33
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add current terminal path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time

class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'
        self.env = 'Pendulum-v0'
        self.seed = 0
        self.result_path = curr_path+"/results/"+self.env+'/'+curr_time+'/results/'  # path to save results
        self.model_path = curr_path+"/results/"+self.env+'/'+curr_time+'/models/'  # path to save models
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor (gamma)
        self.lr = 0.0005  # Target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.5  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Runs the policy for X episodes and returns the per-episode rewards
# A fixed seed is used for the eval environment
def eval(env_name, agent, seed, eval_episodes=50):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    rewards, ma_rewards = [], []
    for i_episode in range(eval_episodes):
        ep_reward = 0
        state, done = eval_env.reset(), False
        while not done:
            # eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            ep_reward += reward
        print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}")
        rewards.append(ep_reward)
        # compute the sliding-window (moving average) reward
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards

if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(state_dim, action_dim, max_action, cfg)
    cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
    cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
    td3.load(cfg.model_path)
    rewards, ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="eval", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
codes/TD3/task1_train.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add current terminal path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time

class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'
        self.env = 'Pendulum-v0'
        self.seed = 0
        self.result_path = curr_path+"/results/"+self.env+'/'+curr_time+'/results/'  # path to save results
        self.model_path = curr_path+"/results/"+self.env+'/'+curr_time+'/models/'  # path to save models
        self.start_timestep = 25e3  # time steps during which the initial random policy is used
        self.start_ep = 50  # episodes during which the initial random policy is used
        self.eval_freq = 10  # how often (episodes) we evaluate
        self.train_eps = 600
        self.max_timestep = 100000  # max time steps to run environment
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.9  # discount factor (gamma)
        self.lr = 0.0005  # Target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.3  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Runs the policy for X episodes and returns the average reward
# A fixed seed is used for the eval environment
def eval(env, agent, seed, eval_episodes=10):
    eval_env = gym.make(env)
    eval_env.seed(seed + 100)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            # eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward

def train(cfg, env, agent):
    rewards = []
    ma_rewards = []  # moving average reward
    for i_ep in range(int(cfg.train_eps)):
        ep_reward = 0
        ep_timesteps = 0
        state, done = env.reset(), False
        while not done:
            ep_timesteps += 1
            # Select action randomly or according to policy
            if i_ep < cfg.start_ep:
                action = env.action_space.sample()
            else:
                action = (
                    agent.choose_action(np.array(state))
                    + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
                ).clip(-max_action, max_action)
            # Perform action
            next_state, reward, done, _ = env.step(action)
            done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0
            # Store data in replay buffer
            agent.memory.push(state, action, next_state, reward, done_bool)
            state = next_state
            ep_reward += reward
            # Train agent after collecting sufficient data
            if i_ep+1 >= cfg.start_ep:
                agent.update()
        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Step:{ep_timesteps}, Reward:{ep_reward:.3f}")
        rewards.append(ep_reward)
        # compute the sliding-window (moving average) reward
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards

if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    agent = TD3(state_dim, action_dim, max_action, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="train", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
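train() above stores transitions through agent.memory.push(state, action, next_state, reward, done_bool), and the agent constructs the buffer as ReplayBuffer(state_dim, action_dim) in the first hunk. A minimal sketch of a buffer compatible with those calls, modelled on the standard TD3 reference implementation; the repo's actual ReplayBuffer (and the convention of its sample() return values) may differ.

# Sketch only: ring buffer compatible with ReplayBuffer(state_dim, action_dim),
# push(state, action, next_state, reward, done) and sample(batch_size) as used above.
import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def push(self, state, action, next_state, reward, done):
        # Overwrite the oldest transition once the buffer is full
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample a batch and move it to the device as float tensors
        idx = np.random.randint(0, self.size, size=batch_size)
        return (
            torch.FloatTensor(self.state[idx]).to(self.device),
            torch.FloatTensor(self.action[idx]).to(self.device),
            torch.FloatTensor(self.next_state[idx]).to(self.device),
            torch.FloatTensor(self.reward[idx]).to(self.device),
            torch.FloatTensor(self.not_done[idx]).to(self.device),
        )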