easy-rl/codes/A2C/task0.py
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add parent path so common/ and a2c can be imported
import gym
import numpy as np
import torch
import torch.optim as optim
import datetime
import argparse
from common.multiprocessing_env import SubprocVecEnv
from a2c import ActorCritic
from common.utils import save_results, make_dir
from common.utils import plot_rewards, save_args
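
# Synchronous A2C on CartPole-v0: several environments run in parallel via
# SubprocVecEnv, the ActorCritic model (imported from a2c.py) outputs an action
# distribution and a state value, and updates use n-step bootstrapped returns.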
def get_args():
""" Hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--n_envs',default=8,type=int,help="numbers of environments")
    parser.add_argument('--max_steps',default=20000,type=int,help="maximum number of training steps")
    parser.add_argument('--n_steps',default=5,type=int,help="number of steps per rollout (n-step return length)")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--lr',default=1e-3,type=float,help="learning rate")
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_known_args()[0].env_name + \
            '/' + curr_time + '/results/' ) # path to save results; parse_known_args avoids errors from flags not yet added
    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_known_args()[0].env_name + \
            '/' + curr_time + '/models/' ) # path to save models
    parser.add_argument('--save_fig',default=True,type=lambda x: str(x).lower() == 'true',help="whether to save the reward figure") # plain type=bool would treat the string 'False' as True
args = parser.parse_args()
return args
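
# SubprocVecEnv expects a list of argument-free callables (thunks) and creates
# each environment in its own worker process. Note that env.seed(...) and the
# 4-tuple return of env.step(...) used below follow the old Gym (<0.26) API.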
def make_envs(env_name):
def _thunk():
env = gym.make(env_name)
env.seed(2)
return env
return _thunk
def test_env(env,model,vis=False):
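    """Run one evaluation episode on a single (non-vectorized) env.
    Note: relies on the global cfg created in __main__ for the device."""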
state = env.reset()
if vis: env.render()
done = False
total_reward = 0
while not done:
state = torch.FloatTensor(state).unsqueeze(0).to(cfg.device)
dist, _ = model(state)
next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
state = next_state
if vis: env.render()
total_reward += reward
return total_reward
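
# n-step bootstrapped returns: starting from the critic's value of the state
# reached after the rollout, work backwards through the buffer with
#   R[t] = r[t] + gamma * R[t+1] * mask[t]
# where mask[t] = 0 at episode ends, cutting the bootstrap there.
# e.g. rewards=[1,1,1], masks=[1,1,0], gamma=0.99 gives returns=[2.9701, 1.99, 1.0].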
def compute_returns(next_value, rewards, masks, gamma=0.99):
R = next_value
returns = []
for step in reversed(range(len(rewards))):
R = rewards[step] + gamma * R * masks[step]
returns.insert(0, R)
return returns
def train(cfg,envs):
print('Start training!')
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
n_states = envs.observation_space.shape[0]
n_actions = envs.action_space.n
model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr) # use the configured learning rate
step_idx = 0
test_rewards = []
test_ma_rewards = []
state = envs.reset()
while step_idx < cfg.max_steps:
log_probs = []
values = []
rewards = []
masks = []
entropy = 0
# rollout trajectory
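        # each iteration steps all cfg.n_envs environments once, so one rollout
        # collects n_steps * n_envs transitions (log-probs, values, rewards, masks)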
for _ in range(cfg.n_steps):
state = torch.FloatTensor(state).to(cfg.device)
dist, value = model(state)
action = dist.sample()
next_state, reward, done, _ = envs.step(action.cpu().numpy())
log_prob = dist.log_prob(action)
entropy += dist.entropy().mean()
log_probs.append(log_prob)
values.append(value)
rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
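            # mask = 0 where an env finished, so compute_returns does not
            # bootstrap across episode boundaries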
masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
state = next_state
step_idx += 1
if step_idx % 100 == 0:
test_reward = np.mean([test_env(env,model) for _ in range(10)])
print(f"step_idx:{step_idx}, test_reward:{test_reward}")
test_rewards.append(test_reward)
if test_ma_rewards:
test_ma_rewards.append(0.9*test_ma_rewards[-1]+0.1*test_reward)
else:
test_ma_rewards.append(test_reward)
# plot(step_idx, test_rewards)
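        # bootstrap the n-step returns with the critic's value of the state
        # reached at the end of the rollout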
next_state = torch.FloatTensor(next_state).to(cfg.device)
_, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks, gamma=cfg.gamma) # pass the configured discount factor
log_probs = torch.cat(log_probs)
returns = torch.cat(returns).detach()
values = torch.cat(values)
advantage = returns - values
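        # A2C loss: policy gradient weighted by the (detached) advantage,
        # MSE critic loss on the n-step returns, minus an entropy bonus
        # (coefficients 0.5 and 0.001) to keep the policy exploratory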
actor_loss = -(log_probs * advantage.detach()).mean()
critic_loss = advantage.pow(2).mean()
loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
optimizer.zero_grad()
loss.backward()
optimizer.step()
    print('Finished training!')
return test_rewards, test_ma_rewards
if __name__ == "__main__":
cfg = get_args()
    envs = [make_envs(cfg.env_name) for _ in range(cfg.n_envs)] # one env-constructing thunk per worker
envs = SubprocVecEnv(envs)
# training
rewards,ma_rewards = train(cfg,envs)
make_dir(cfg.result_path,cfg.model_path)
save_args(cfg)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="train") # plot the reward curves
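
# Example usage (assuming the repo's common/ helpers and a2c.py are importable,
# e.g. when run from easy-rl/codes/A2C/):
#   python task0.py --n_envs 8 --max_steps 20000 --lr 1e-3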