This commit is contained in:
johnjim0816
2021-09-27 03:44:29 +08:00
parent 1e60b688fc
commit fb2affb69e
18 changed files with 191 additions and 24 deletions

codes/GAE/task0_train.py (new file, 167 lines)

@@ -0,0 +1,167 @@
import math
import random
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the current file
parent_path = os.path.dirname(curr_path)  # parent directory
sys.path.append(parent_path)  # add the parent directory to sys.path
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
from common.multiprocessing_env import SubprocVecEnv
num_envs = 16
env_name = "Pendulum-v0"

def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)  # vectorized environments running in subprocesses
env = gym.make(env_name)    # single environment used for evaluation

def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        self.actor = nn.Sequential(
            nn.Linear(num_inputs, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_outputs),
        )
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
        self.apply(init_weights)

    def forward(self, x):
        value = self.critic(x)
        mu = self.actor(x)
        std = self.log_std.exp().expand_as(mu)
        dist = Normal(mu, std)  # Gaussian policy with state-independent log std
        return dist, value
def plot(frame_idx, rewards):
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
    plt.plot(rewards)
    plt.show()

def test_env(vis=False):
    # run one episode in the single evaluation env with the current policy
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    # Generalized Advantage Estimation, computed backwards over the rollout:
    #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    #   A_t     = delta_t + gamma * tau * mask_t * A_{t+1}
    # and returns R_t = A_t + V(s_t)
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns
num_inputs = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]

# Hyperparameters
hidden_size = 256
lr = 3e-2
num_steps = 20  # rollout length per update

model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
optimizer = optim.Adam(model.parameters())

max_frames = 100000
frame_idx = 0
test_rewards = []
state = envs.reset()

while frame_idx < max_frames:
    log_probs = []
    values = []
    rewards = []
    masks = []
    entropy = 0
    # collect a rollout of num_steps transitions from the parallel envs
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)
        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        log_prob = dist.log_prob(action)
        entropy += dist.entropy().mean()
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        state = next_state
        frame_idx += 1
        if frame_idx % 1000 == 0:
            test_rewards.append(np.mean([test_env() for _ in range(10)]))
            print(test_rewards[-1])
            # plot(frame_idx, test_rewards)
    # bootstrap with the value of the last state and compute GAE returns
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, rewards, masks, values)
    log_probs = torch.cat(log_probs)
    returns = torch.cat(returns).detach()
    values = torch.cat(values)
    advantage = returns - values
    actor_loss = -(log_probs * advantage.detach()).mean()
    critic_loss = advantage.pow(2).mean()
    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
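A quick sanity check of the compute_gae recursion above, illustrative only and not part of the committed file, using plain floats and the default gamma=0.99, tau=0.95:
rewards = [1.0, 1.0]   # two-step rollout, episode terminates at the second step
masks   = [1.0, 0.0]   # mask = 0 marks the terminal transition
values  = [0.5, 0.5]   # critic estimates for the two states
returns = compute_gae(next_value=0.5, rewards=rewards, masks=masks, values=values)
# step 1: delta = 1 + 0.99*0.5*0 - 0.5 = 0.5,   A = 0.5,     return = 1.0
# step 0: delta = 1 + 0.99*0.5*1 - 0.5 = 0.995, A = 1.46525, return = 1.96525
print(returns)  # [1.96525, 1.0]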

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:17:42
LastEditor: John
-LastEditTime: 2021-04-28 10:11:09
+LastEditTime: 2021-09-26 22:02:00
Discription:
Environment:
'''
@@ -41,10 +41,8 @@ class PPO:
    def update(self):
        for _ in range(self.n_epochs):
-            state_arr, action_arr, old_prob_arr, vals_arr,\
-                reward_arr, dones_arr, batches = \
-                    self.memory.sample()
-            values = vals_arr
+            state_arr, action_arr, old_prob_arr, vals_arr,reward_arr, dones_arr, batches = self.memory.sample()
+            values = vals_arr[:]
            ### compute advantage ###
            advantage = np.zeros(len(reward_arr), dtype=np.float32)
            for t in range(len(reward_arr)-1):

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:30:46
LastEditor: John
-LastEditTime: 2021-03-23 15:30:55
+LastEditTime: 2021-09-26 22:00:07
Discription:
Environment:
'''
@@ -24,13 +24,8 @@ class PPOMemory:
        indices = np.arange(len(self.states), dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_step]
-        return np.array(self.states),\
-            np.array(self.actions),\
-            np.array(self.probs),\
-            np.array(self.vals),\
-            np.array(self.rewards),\
-            np.array(self.dones),\
-            batches
+        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
+            np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
    def push(self, state, action, probs, vals, reward, done):
        self.states.append(state)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-22 16:18:10
LastEditor: John
-LastEditTime: 2021-05-06 00:43:36
+LastEditTime: 2021-09-26 22:05:00
Discription:
Environment:
'''
@@ -17,6 +17,7 @@ sys.path.append(parent_path) # add current terminal path to sys.path
import gym
import torch
import datetime
+import tqdm
from PPO.agent import PPO
from common.plot import plot_rewards
from common.utils import save_results,make_dir
@@ -51,7 +52,7 @@ def env_agent_config(cfg,seed=1):
    return env,agent
def train(cfg,env,agent):
-    print('Start to train !')
+    print('开始训练!')
    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
    rewards= []
    ma_rewards = []  # moving average rewards
@@ -75,7 +76,7 @@ def train(cfg,env,agent):
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
+        print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
    print('Complete training')
    return rewards,ma_rewards

Binary file not shown (image added, 49 KiB)

Binary file not shown (image added, 48 KiB)

Binary file not shown (image added, 33 KiB)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
-LastEditTime: 2021-09-20 00:32:59
+LastEditTime: 2021-09-23 12:22:58
Discription:
Environment:
'''
@@ -34,7 +34,7 @@ class QlearningConfig:
        self.train_eps = 400  # number of training episodes
        self.eval_eps = 30  # number of evaluation episodes
        self.gamma = 0.9  # discount factor for rewards
-        self.epsilon_start = 0.99  # initial epsilon of the e-greedy policy
+        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
        self.epsilon_decay = 300  # decay rate of epsilon in the e-greedy policy
        self.lr = 0.1  # learning rate
@@ -53,14 +53,15 @@ def env_agent_config(cfg,seed=1):
def train(cfg,env,agent):
    print('开始训练!')
    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
-    rewards = []
-    ma_rewards = []  # moving-average rewards
+    rewards = []  # record rewards
+    ma_rewards = []  # record moving-average rewards
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated over this episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # select an action with the algorithm
            next_state, reward, done, _ = env.step(action)  # one interaction with the environment
+            print(reward)
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # update the state
            ep_reward += reward
@@ -78,6 +79,8 @@ def train(cfg,env,agent):
def eval(cfg,env,agent):
    print('开始测试!')
    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
+    for item in agent.Q_table.items():
+        print(item)
    rewards = []  # rewards of all evaluation episodes
    ma_rewards = []  # moving-average rewards
    for i_ep in range(cfg.eval_eps):
@@ -86,7 +89,7 @@ def eval(cfg,env,agent):
        while True:
            action = agent.predict(state)  # select an action with the algorithm
            next_state, reward, done, _ = env.step(action)  # one interaction with the environment
-            state = next_state  # store the previous observation
+            state = next_state  # update the state
            ep_reward += reward
            if done:
                break
@@ -103,10 +106,12 @@ if __name__ == "__main__":
    cfg = QlearningConfig()
    # training
-    env,agent = env_agent_config(cfg,seed=1)
+    env,agent = env_agent_config(cfg,seed=0)
    rewards,ma_rewards = train(cfg,env,agent)
    make_dir(cfg.result_path,cfg.model_path)  # create output folders
    agent.save(path=cfg.model_path)  # save the model
+    for item in agent.Q_table.items():
+        print(item)
    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)  # save results
    plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
@@ -114,6 +119,7 @@ if __name__ == "__main__":
    env,agent = env_agent_config(cfg,seed=10)
    agent.load(path=cfg.model_path)  # load the model
    rewards,ma_rewards = eval(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
    plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
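For reference, a minimal sketch of how the epsilon_start, epsilon_end and epsilon_decay settings above are typically combined into an exponentially decaying exploration rate; this assumes the usual pattern, since the agent code itself is not part of this diff:
import math
def epsilon_by_count(sample_count, epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300):
    # decay epsilon exponentially from epsilon_start towards epsilon_end as samples accumulate
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1.0 * sample_count / epsilon_decay)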

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-07 20:57:11
LastEditor: John
-LastEditTime: 2021-09-19 23:00:36
+LastEditTime: 2021-09-23 12:23:01
Discription:
Environment:
'''