easy-rl/codes/A2C/main.py
JohnJim0816 8ac41fb16b add A2C
2020-11-09 21:24:23 +08:00

#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
@LastEditTime: 2020-11-08 22:19:56
@Description:
@Environment: python 3.7.9
'''
import torch
import gym
import os
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
from agent import A2C
from env import make_envs
from utils import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
from utils import save_model, save_results


def get_args():
    '''Once the model is set up, hyperparameters only need to be tuned here.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means eval only
    parser.add_argument("--gamma", default=0.99,
                        type=float)  # reward discount factor
    parser.add_argument("--lr", default=3e-4, type=float)  # critic learning rate
    parser.add_argument("--actor_lr", default=1e-4, type=float)  # actor learning rate
    parser.add_argument("--memory_capacity", default=10000,
                        type=int, help="capacity of Replay Memory")
    parser.add_argument("--batch_size", default=128, type=int,
                        help="batch size of memory sampling")
    parser.add_argument("--train_eps", default=4000, type=int)  # number of training episodes
    parser.add_argument("--train_steps", default=5, type=int)  # rollout length per training episode
    parser.add_argument("--eval_eps", default=200, type=int)  # max number of eval episodes
    parser.add_argument("--eval_steps", default=200,
                        type=int)  # max length of each eval episode
    parser.add_argument("--target_update", default=4, type=int,
                        help="how often (in episodes) to update the target network")
    config = parser.parse_args()
    return config
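
# NOTE: memory_capacity, batch_size and target_update are parsed above but not
# used anywhere in this script -- the A2C loop below learns on-policy from
# short rollouts and has no replay buffer or target network.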


def test_env(agent, device='cpu'):
    '''Run the current policy for one episode on a single CartPole env and return its reward.'''
    env = gym.make("CartPole-v0")
    state = env.reset()
    ep_reward = 0
    for _ in range(200):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, value = agent.model(state)
        action = dist.sample()
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        state = next_state
        ep_reward += reward
        if done:
            break
    return ep_reward
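
# NOTE: test_env samples an action from the policy distribution at every step;
# a deterministic evaluation could instead take the distribution's mode
# (e.g. dist.probs.argmax(dim=-1), assuming dist is a Categorical).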


def train(cfg):
    print('Start to train!\n')
    envs = make_envs(num_envs=16, env_name="CartPole-v0")
    n_states = envs.observation_space.shape[0]
    n_actions = envs.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = A2C(n_states, n_actions, hidden_dim=256)  # agent defined in agent.py
    # moving_average_rewards = []
    # ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    state = envs.reset()
    for i_episode in range(1, cfg.train_eps+1):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        # collect a short rollout of cfg.train_steps steps from the vectorized envs
        for i_step in range(1, cfg.train_steps+1):
            state = torch.FloatTensor(state).to(device)
            dist, value = agent.model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            state = next_state
            # unsqueeze so log_probs match the (num_envs, 1) shape of values/returns below
            log_prob = dist.log_prob(action).unsqueeze(1)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            # mask = 0 where an episode ended, so returns do not bootstrap across resets
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
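
        # NOTE: each "episode" here is a short cfg.train_steps-step rollout
        # gathered in parallel from the 16 vectorized environments rather than
        # a full CartPole episode; the masks mark where individual environments
        # terminated during the rollout.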
        # every 20 episodes, print the reward of one evaluation episode
        if i_episode % 20 == 0:
            print("reward", test_env(agent, device=device))
        # bootstrap from the value of the state reached after the last rollout step
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = agent.model(next_state)
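        # agent.compute_returns is expected (in the usual A2C fashion) to walk
        # the rollout backwards and form bootstrapped returns:
        #     R = reward[t] + gamma * mask[t] * R,  starting from R = next_value,
        # so that bootstrapping stops wherever an environment terminated.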
        returns = agent.compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        # advantage = bootstrapped return minus the critic's value estimate
        advantage = returns - values
        # actor: policy gradient weighted by the (detached) advantage
        # critic: mean-squared error of the value estimate
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()
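        # One optimizer step updates actor and critic together: agent.model
        # returns both the action distribution and the state value, and the
        # combined loss above weights the critic term by 0.5 and subtracts a
        # small entropy bonus to keep exploration alive.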
        # per-episode logging, currently disabled (ep_reward is not computed in this version):
        # print('Episode:', i_episode, ' Reward: %i' %
        #       int(ep_reward[0]), 'n_steps:', i_step)
        # ep_steps.append(i_step)
        # rewards.append(ep_reward)
        # if i_episode == 1:
        #     moving_average_rewards.append(ep_reward[0])
        # else:
        #     moving_average_rewards.append(
        #         0.9*moving_average_rewards[-1]+0.1*ep_reward[0])
        # writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
        # writer.add_scalar('steps_of_each_episode',
        #                   ep_steps[-1], i_episode)
    # after training, evaluate the final policy for 100 episodes
    for _ in range(100):
        print("test_reward", test_env(agent, device=device))
    writer.close()
    print('Complete training')
    ''' Save the model '''
    # save_model(agent, model_path=SAVED_MODEL_PATH)
    # '''Save rewards and related results'''
    # save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path=RESULT_PATH)
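
# NOTE: the commented-out eval() below still references DDPG/Pendulum code
# (NormalizedActions, Pendulum-v0, DDPG) rather than the A2C agent used above;
# it would need to be rewritten for A2C before being re-enabled.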
# def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
#     print('Start to eval!\n')
#     env = NormalizedActions(gym.make("Pendulum-v0"))
#     n_states = env.observation_space.shape[0]
#     n_actions = env.action_space.shape[0]
#     agent = DDPG(n_states, n_actions, critic_lr=1e-3,
#                  actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
#     agent.load_model(saved_model_path+'checkpoint.pth')
#     rewards = []
#     moving_average_rewards = []
#     ep_steps = []
#     log_dir = os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
#     writer = SummaryWriter(log_dir)
#     for i_episode in range(1, cfg.eval_eps+1):
#         state = env.reset()  # reset the environment
#         ep_reward = 0
#         for i_step in range(1, cfg.eval_steps+1):
#             action = agent.choose_action(state)  # choose an action for the current state
#             next_state, reward, done, _ = env.step(action)  # step the environment
#             ep_reward += reward
#             state = next_state  # move to the next state
#             if done:
#                 break
#         print('Episode:', i_episode, ' Reward: %i' %
#               int(ep_reward), 'n_steps:', i_step, 'done: ', done)
#         ep_steps.append(i_step)
#         rewards.append(ep_reward)
#         # compute the moving-average reward
#         if i_episode == 1:
#             moving_average_rewards.append(ep_reward)
#         else:
#             moving_average_rewards.append(
#                 0.9*moving_average_rewards[-1]+0.1*ep_reward)
#         writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
#         writer.add_scalar('steps_of_each_episode',
#                           ep_steps[-1], i_episode)
#     writer.close()
#     '''Save rewards and related results'''
#     if not os.path.exists(RESULT_PATH):  # create the results folder if it does not exist
#         os.mkdir(RESULT_PATH)
#     np.save(RESULT_PATH+'rewards_eval.npy', rewards)
#     np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
#     np.save(RESULT_PATH+'steps_eval.npy', ep_steps)


if __name__ == "__main__":
    cfg = get_args()
    train(cfg)
    # cfg = get_args()
    # if cfg.train:
    #     train(cfg)
    #     eval(cfg)
    # else:
    #     model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
    #     eval(cfg, saved_model_path=model_path)