update projects
projects/codes/TD3/README.md  (new file, 1 line)
@@ -0,0 +1 @@
This is a reproduction of [Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)](https://arxiv.org/abs/1802.09477).
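A minimal usage sketch, not part of this commit: it drives the `TD3` agent defined in `agent.py` below with a plain training loop, assuming gym's `Pendulum-v1` and hyperparameters similar to `task1_train.py`; the `SimpleNamespace` config is a stand-in for the `TD3Config` classes in the training scripts.

```python
# Hypothetical sketch (not from this commit); field names match what TD3.__init__ reads from cfg.
from types import SimpleNamespace

import gym
import numpy as np
import torch

from TD3.agent import TD3

cfg = SimpleNamespace(
    gamma=0.99,            # discount factor
    lr=0.005,              # used by agent.py as the soft target-update coefficient
    policy_noise=0.2, noise_clip=0.5, policy_freq=2,
    batch_size=256,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

env = gym.make("Pendulum-v1")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(n_states, n_actions, max_action, cfg)

for _ in range(10):                                   # a few episodes, just to illustrate the loop
    state, done = env.reset(), False
    while not done:
        action = agent.choose_action(np.array(state))
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, next_state, reward, float(done))
        if agent.memory.size > cfg.batch_size:        # only update once enough transitions are stored
            agent.update()
        state = next_state
```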
projects/codes/TD3/agent.py  (new file, 177 lines)
@@ -0,0 +1,177 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 10:40:05
LastEditor: JiangJi
LastEditTime: 2021-12-22 10:43:55
Description:
'''
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from TD3.memory import ReplayBuffer


class Actor(nn.Module):

    def __init__(self, input_dim, output_dim, max_action):
        '''
        Args:
            input_dim (int): input dimension, equal to n_states here
            output_dim (int): output dimension, equal to n_actions here
            max_action (float): maximum absolute value of the action
        '''
        super(Actor, self).__init__()

        self.l1 = nn.Linear(input_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, output_dim)
        self.max_action = max_action

    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        # tanh squashes the output to [-1, 1]; scale by max_action to match the action space
        return self.max_action * torch.tanh(self.l3(a))


class Critic(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(input_dim + output_dim, 256)
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, 1)

        # Q2 architecture
        self.l4 = nn.Linear(input_dim + output_dim, 256)
        self.l5 = nn.Linear(256, 256)
        self.l6 = nn.Linear(256, 1)

    def forward(self, state, action):
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = self.l6(q2)
        return q1, q2

    def Q1(self, state, action):
        # Q1 only, used for the actor (policy) loss
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = self.l3(q1)
        return q1


class TD3(object):
    def __init__(
        self,
        input_dim,
        output_dim,
        max_action,
        cfg,
    ):
        self.max_action = max_action
        self.gamma = cfg.gamma
        self.lr = cfg.lr
        self.policy_noise = cfg.policy_noise
        self.noise_clip = cfg.noise_clip
        self.policy_freq = cfg.policy_freq
        self.batch_size = cfg.batch_size
        self.device = cfg.device
        self.total_it = 0

        # Note: the optimizer learning rate is hard-coded to 3e-4; cfg.lr is used below as the soft-update coefficient
        self.actor = Actor(input_dim, output_dim, max_action).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(input_dim, output_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
        self.memory = ReplayBuffer(input_dim, output_dim)

    def choose_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.actor(state).cpu().data.numpy().flatten()

    def update(self):
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = self.memory.sample(self.batch_size)

        with torch.no_grad():
            # Select action according to the target policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value (clipped double-Q)
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Soft-update the frozen target models (cfg.lr doubles as the Polyak coefficient tau here)
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.lr * param.data + (1 - self.lr) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.lr * param.data + (1 - self.lr) * target_param.data)

    def save(self, path):
        torch.save(self.critic.state_dict(), path + "td3_critic")
        torch.save(self.critic_optimizer.state_dict(), path + "td3_critic_optimizer")

        torch.save(self.actor.state_dict(), path + "td3_actor")
        torch.save(self.actor_optimizer.state_dict(), path + "td3_actor_optimizer")

    def load(self, path):
        self.critic.load_state_dict(torch.load(path + "td3_critic"))
        self.critic_optimizer.load_state_dict(torch.load(path + "td3_critic_optimizer"))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(path + "td3_actor"))
        self.actor_optimizer.load_state_dict(torch.load(path + "td3_actor_optimizer"))
        self.actor_target = copy.deepcopy(self.actor)
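For reference, the target value computed inside `update()` above is the clipped double-Q backup from the TD3 paper; written out (a restatement of the code, where sigma is `policy_noise`, c is `noise_clip`, a_max is `max_action`, and (1 - d) is `not_done`):

```latex
\tilde{a} = \operatorname{clip}\!\left(\pi_{\phi'}(s') + \epsilon,\; -a_{\max},\; a_{\max}\right),
\qquad \epsilon \sim \operatorname{clip}\!\left(\mathcal{N}(0, \sigma^2),\; -c,\; c\right)

y = r + \gamma\,(1 - d)\,\min\!\left(Q_{\theta_1'}(s', \tilde{a}),\; Q_{\theta_2'}(s', \tilde{a})\right)
```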
projects/codes/TD3/memory.py  (new file, 44 lines)
@@ -0,0 +1,44 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-04-13 11:00:13
LastEditor: John
LastEditTime: 2021-04-15 01:25:14
Description:
Environment:
'''
import numpy as np
import torch


class ReplayBuffer(object):
    def __init__(self, n_states, n_actions, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, n_states))
        self.action = np.zeros((max_size, n_actions))
        self.next_state = np.zeros((max_size, n_states))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def push(self, state, action, next_state, reward, done):
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size  # overwrite the oldest entries once the buffer is full
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        ind = np.random.randint(0, self.size, size=batch_size)
        return (
            torch.FloatTensor(self.state[ind]).to(self.device),
            torch.FloatTensor(self.action[ind]).to(self.device),
            torch.FloatTensor(self.next_state[ind]).to(self.device),
            torch.FloatTensor(self.reward[ind]).to(self.device),
            torch.FloatTensor(self.not_done[ind]).to(self.device)
        )
Binary files added (not shown): three images (44 KiB, 67 KiB, 55 KiB) and several saved model/optimizer checkpoints, including:
BIN  projects/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_actor  (new file)
BIN  projects/codes/TD3/outputs/Reacher-v2/20210415-021952/td3_critic  (new file)
projects/codes/TD3/task0_eval.py  (new file, 89 lines)
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-23 20:36:23
LastEditor: JiangJi
LastEditTime: 2021-04-23 20:37:22
Description:
Environment:
'''
import sys, os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add parent path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time


class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3 and Random'
        self.env = 'HalfCheetah-v2'
        self.seed = 0
        self.result_path = curr_path + "/results/" + self.env + '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/results/" + self.env + '/' + curr_time + '/models/'  # path to save models
        self.start_timestep = 25e3  # time steps during which the initial random policy is used
        self.eval_freq = 5e3  # how often (time steps) we evaluate
        self.max_timestep = 200000  # max time steps to run the environment
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor
        self.lr = 0.0005  # target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.5  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Runs the policy for X episodes and returns per-episode rewards
# A fixed seed is used for the eval environment
def eval(env_name, agent, seed, eval_episodes=50):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    rewards, ma_rewards = [], []
    for i_episode in range(eval_episodes):
        ep_reward = 0
        state, done = eval_env.reset(), False
        while not done:
            eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            ep_reward += reward
        print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}")
        rewards.append(ep_reward)
        # compute the moving-average reward
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(n_states, n_actions, max_action, cfg)
    cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
    td3.load(cfg.model_path)
    td3_rewards, td3_ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(td3_rewards, td3_ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards({'td3_rewards': td3_rewards, 'td3_ma_rewards': td3_ma_rewards}, tag="eval",
                 env=cfg.env, algo=cfg.algo, path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env,agent, cfg.seed)
projects/codes/TD3/task0_train.py  (new file, 173 lines)
@@ -0,0 +1,173 @@
import sys, os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add parent path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time


class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'
        self.env = 'HalfCheetah-v2'
        self.seed = 0
        self.result_path = curr_path + "/results/" + self.env + '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/results/" + self.env + '/' + curr_time + '/models/'  # path to save models
        self.start_timestep = 25e3  # time steps during which the initial random policy is used
        self.eval_freq = 5e3  # how often (time steps) we evaluate
        # self.train_eps = 800
        self.max_timestep = 4000000  # max time steps to run the environment
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor
        self.lr = 0.0005  # target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.5  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Runs the policy for X episodes and returns the average reward
# A fixed seed is used for the eval environment
def eval(env, agent, seed, eval_episodes=10):
    eval_env = gym.make(env)
    eval_env.seed(seed + 100)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            # eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward


def train(cfg, env, agent):
    # Evaluate untrained policy
    evaluations = [eval(cfg.env, agent, cfg.seed)]
    state, done = env.reset(), False
    ep_reward = 0
    ep_timesteps = 0
    episode_num = 0
    rewards = []
    ma_rewards = []  # moving-average reward
    for t in range(int(cfg.max_timestep)):
        ep_timesteps += 1
        # Select action randomly or according to policy
        if t < cfg.start_timestep:
            action = env.action_space.sample()
        else:
            action = (
                agent.choose_action(np.array(state))
                + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
            ).clip(-max_action, max_action)
        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0
        # Store data in replay buffer
        agent.memory.push(state, action, next_state, reward, done_bool)
        state = next_state
        ep_reward += reward
        # Train agent after collecting sufficient data
        if t >= cfg.start_timestep:
            agent.update()
        if done:
            # +1 to account for 0 indexing; ep_timesteps needs no +1 since it is incremented even when done=True
            print(f"Episode:{episode_num+1}, Episode T:{ep_timesteps}, Reward:{ep_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            rewards.append(ep_reward)
            # compute the moving-average reward
            if ma_rewards:
                ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
            else:
                ma_rewards.append(ep_reward)
            ep_reward = 0
            ep_timesteps = 0
            episode_num += 1
        # Evaluate episode
        if (t + 1) % cfg.eval_freq == 0:
            evaluations.append(eval(cfg.env, agent, cfg.seed))
    return rewards, ma_rewards

# An earlier, episode-based version of train() kept here commented out:
# def train(cfg,env,agent):
#     evaluations = [eval(cfg.env,agent,cfg.seed)]
#     ep_reward = 0
#     tot_timestep = 0
#     rewards = []
#     ma_rewards = []  # moving-average reward
#     for i_ep in range(int(cfg.train_eps)):
#         state, done = env.reset(), False
#         ep_reward = 0
#         ep_timestep = 0
#         while not done:
#             ep_timestep += 1
#             tot_timestep += 1
#             # Select action randomly or according to policy
#             if tot_timestep < cfg.start_timestep:
#                 action = env.action_space.sample()
#             else:
#                 action = (
#                     agent.choose_action(np.array(state))
#                     + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
#                 ).clip(-max_action, max_action)
#             # Perform action
#             next_state, reward, done, _ = env.step(action)
#             done_bool = float(done) if ep_timestep < env._max_episode_steps else 0
#
#             # Store data in replay buffer
#             agent.memory.push(state, action, next_state, reward, done_bool)
#             state = next_state
#             ep_reward += reward
#             # Train agent after collecting sufficient data
#             if tot_timestep >= cfg.start_timestep:
#                 agent.update()
#         print(f"Episode:{i_ep}/{cfg.train_eps}, Episode Timestep:{ep_timestep}, Reward:{ep_reward:.3f}")
#         rewards.append(ep_reward)
#         # compute the moving-average reward
#         if ma_rewards:
#             ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
#         else:
#             ma_rewards.append(ep_reward)
#         # Evaluate episode
#         if (i_ep+1) % cfg.eval_freq == 0:
#             evaluations.append(eval(cfg.env,agent, cfg.seed))
#     return rewards,ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    agent = TD3(n_states, n_actions, max_action, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="train", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
    # cfg.result_path = './TD3/results/HalfCheetah-v2/20210416-130341/'
    # agent.load(cfg.result_path)
    # eval(cfg.env,agent, cfg.seed)
projects/codes/TD3/task1_eval.py  (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-23 20:36:23
LastEditor: JiangJi
LastEditTime: 2021-04-28 10:14:33
Description:
Environment:
'''
import sys, os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add parent path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time


class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'
        self.env = 'Pendulum-v0'
        self.seed = 0
        self.result_path = curr_path + "/results/" + self.env + '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/results/" + self.env + '/' + curr_time + '/models/'  # path to save models
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.99  # discount factor
        self.lr = 0.0005  # target network update rate
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.5  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Runs the policy for X episodes and returns per-episode rewards
# A fixed seed is used for the eval environment
def eval(env_name, agent, seed, eval_episodes=50):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    rewards, ma_rewards = [], []
    for i_episode in range(eval_episodes):
        ep_reward = 0
        state, done = eval_env.reset(), False
        while not done:
            # eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            ep_reward += reward
        print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}")
        rewards.append(ep_reward)
        # compute the moving-average reward
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    env = gym.make(cfg.env)
    env.seed(cfg.seed)  # set seeds
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    td3 = TD3(n_states, n_actions, max_action, cfg)
    cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
    cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
    td3.load(cfg.model_path)
    rewards, ma_rewards = eval(cfg.env, td3, cfg.seed)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(rewards, ma_rewards, tag='eval', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="eval", env=cfg.env, algo=cfg.algo, path=cfg.result_path)
projects/codes/TD3/task1_train.py  (new file, 122 lines)
@@ -0,0 +1,122 @@
import sys, os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add parent path to sys.path

import torch
import gym
import numpy as np
import datetime

from TD3.agent import TD3
from common.plot import plot_rewards
from common.utils import save_results, make_dir

curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time


class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'  # algorithm name
        self.env_name = 'Pendulum-v1'  # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
        self.train_eps = 600  # number of training episodes
        self.start_timestep = 25e3  # time steps during which the initial random policy is used
        self.epsilon_start = 50  # number of initial episodes with a random policy
        self.eval_freq = 10  # how often (episodes) we evaluate
        self.max_timestep = 100000  # max time steps to run the environment
        self.expl_noise = 0.1  # std of Gaussian exploration noise
        self.batch_size = 256  # batch size for both actor and critic
        self.gamma = 0.9  # discount factor
        self.lr = 0.0005  # learning rate (also used as the soft target-update rate in agent.py)
        self.policy_noise = 0.2  # noise added to target policy during critic update
        self.noise_clip = 0.3  # range to clip target policy noise
        self.policy_freq = 2  # frequency of delayed policy updates


class PlotConfig(TD3Config):
    def __init__(self) -> None:
        super().__init__()
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path to save models
        self.save = True  # whether to save figures


# Runs the policy for X episodes and returns the average reward
# A fixed seed is used for the eval environment
def eval(env, agent, seed, eval_episodes=10):
    eval_env = gym.make(env)
    eval_env.seed(seed + 100)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            # eval_env.render()
            action = agent.choose_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward


def train(cfg, env, agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []  # rewards of all episodes
    ma_rewards = []  # moving-average rewards of all episodes
    for i_ep in range(int(cfg.train_eps)):
        ep_reward = 0
        ep_timesteps = 0
        state, done = env.reset(), False
        while not done:
            ep_timesteps += 1
            # Select action randomly or according to policy
            if i_ep < cfg.epsilon_start:
                action = env.action_space.sample()
            else:
                action = (
                    agent.choose_action(np.array(state))
                    + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
                ).clip(-max_action, max_action)
            # Perform action
            next_state, reward, done, _ = env.step(action)
            done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0
            # Store data in replay buffer
            agent.memory.push(state, action, next_state, reward, done_bool)
            state = next_state
            ep_reward += reward
            # Train agent after collecting sufficient data
            if i_ep + 1 >= cfg.epsilon_start:
                agent.update()
        if (i_ep + 1) % 10 == 0:
            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep + 1, cfg.train_eps, ep_reward))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finished training!')
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = TD3Config()
    plot_cfg = PlotConfig()
    env = gym.make(cfg.env_name)
    env.seed(1)  # set random seeds
    torch.manual_seed(1)
    np.random.seed(1)
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    agent = TD3(n_states, n_actions, max_action, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(plot_cfg.result_path, plot_cfg.model_path)
    agent.save(path=plot_cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")