import sys, os

curr_path = os.path.dirname(os.path.abspath(__file__))  # current path
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add to system path

import gym
import numpy as np
import torch
import torch.optim as optim
import datetime
import argparse

from common.multiprocessing_env import SubprocVecEnv
from a2c import ActorCritic
from common.utils import save_results, make_dir
from common.utils import plot_rewards, save_args


def get_args():
    """Hyperparameters"""
    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name', default='A2C', type=str, help="name of algorithm")
    parser.add_argument('--env_name', default='CartPole-v0', type=str, help="name of environment")
    parser.add_argument('--n_envs', default=8, type=int, help="number of parallel environments")
    parser.add_argument('--max_steps', default=20000, type=int, help="maximum number of training steps")
    parser.add_argument('--n_steps', default=5, type=int, help="number of steps per rollout")
    parser.add_argument('--gamma', default=0.99, type=float, help="discount factor")
    parser.add_argument('--lr', default=1e-3, type=float, help="learning rate")
    parser.add_argument('--hidden_dim', default=256, type=int)
    parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
    parser.add_argument('--result_path', default=curr_path + "/outputs/" + parser.parse_known_args()[0].env_name +
                        '/' + curr_time + '/results/')  # path to save results
    parser.add_argument('--model_path', default=curr_path + "/outputs/" + parser.parse_known_args()[0].env_name +
                        '/' + curr_time + '/models/')  # path to save models
    parser.add_argument('--save_fig', default=True, type=bool, help="whether to save the figure")
    args = parser.parse_args()
    return args


def make_envs(env_name):
    """Return a thunk that creates a single environment (the form SubprocVecEnv expects)."""
    def _thunk():
        env = gym.make(env_name)
        env.seed(2)  # note: every worker env uses the same seed
        return env
    return _thunk


def test_env(env, model, cfg, vis=False):
    """Run one evaluation episode in a single environment and return its total reward."""
    state = env.reset()
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(cfg.device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis:
            env.render()
        total_reward += reward
    return total_reward


def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted n-step returns, bootstrapping from the value of the last state."""
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]  # masks zero out the bootstrap at episode ends
        returns.insert(0, R)
    return returns


def train(cfg, envs):
    print('Start training!')
    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
    env = gym.make(cfg.env_name)  # a single env for evaluation
    env.seed(10)
    n_states = envs.observation_space.shape[0]
    n_actions = envs.action_space.n
    model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr)
    step_idx = 0
    test_rewards = []
    test_ma_rewards = []
    state = envs.reset()
    while step_idx < cfg.max_steps:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        # rollout trajectory
        for _ in range(cfg.n_steps):
            state = torch.FloatTensor(state).to(cfg.device)
            dist, value = model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
            state = next_state
            step_idx += 1
            if step_idx % 100 == 0:
                test_reward = np.mean([test_env(env, model, cfg) for _ in range(10)])
                print(f"step_idx:{step_idx}, test_reward:{test_reward}")
                test_rewards.append(test_reward)
                if test_ma_rewards:
                    test_ma_rewards.append(0.9 * test_ma_rewards[-1] + 0.1 * test_reward)  # moving average of test rewards
                else:
                    test_ma_rewards.append(test_reward)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks, cfg.gamma)
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy  # policy loss + value loss - entropy bonus
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Finish training!')
    return test_rewards, test_ma_rewards


if __name__ == "__main__":
    cfg = get_args()
    envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)]
    envs = SubprocVecEnv(envs)
    # training
    rewards, ma_rewards = train(cfg, envs)
    make_dir(cfg.result_path, cfg.model_path)
    save_args(cfg)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot results
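
# A minimal usage sketch. Assumptions: the file name `train_a2c.py` is hypothetical, and the
# `common` package plus `a2c.py` are laid out as the imports above expect.
#   python train_a2c.py --env_name CartPole-v0 --n_envs 8 --max_steps 20000 --lr 1e-3 --device cpu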