update codes
@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
 class A2C:
     ''' A2C算法
     '''
-    def __init__(self,state_dim,action_dim,cfg) -> None:
+    def __init__(self,n_states,n_actions,cfg) -> None:
         self.gamma = cfg.gamma
         self.device = cfg.device
-        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
+        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
         self.optimizer = optim.Adam(self.model.parameters())

     def compute_returns(self,next_value, rewards, masks):
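The body of `compute_returns` is outside this hunk; for reference, a minimal sketch of the usual bootstrapped discounted-return computation (assuming `rewards` and `masks` are per-step lists collected from the rollout, with `mask = 0` at episode ends) looks like:

```python
def compute_returns(self, next_value, rewards, masks):
    # Bootstrap from the value of the state after the last step, then
    # accumulate discounted returns backwards through the rollout.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + self.gamma * R * masks[step]
        returns.insert(0, R)
    return returns
```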
@@ -74,9 +74,9 @@ def train(cfg,envs):
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
     env = gym.make(cfg.env_name) # a single env
     env.seed(10)
-    state_dim = envs.observation_space.shape[0]
-    action_dim = envs.action_space.n
-    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+    n_states = envs.observation_space.shape[0]
+    n_actions = envs.action_space.n
+    model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
     optimizer = optim.Adam(model.parameters())
     frame_idx = 0
     test_rewards = []
@@ -39,15 +39,15 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.action_dim = action_space.shape[0]
+        self.n_actions = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
         self.obs = x + dx
         return self.obs
     def get_action(self, action, t=0):
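`get_action` is cut off by the hunk; a sketch of the typical implementation (assuming sigma is annealed linearly from `max_sigma` to `min_sigma` over `decay_period` steps) is:

```python
def get_action(self, action, t=0):
    # Perturb the deterministic action with OU noise, decay sigma over time,
    # and clip the result back into the valid action range.
    ou_obs = self.evolve_obs()
    self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
    return np.clip(action + ou_obs, self.low, self.high)
```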
@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F

 class FCN(nn.Module):
-    def __init__(self, state_dim=4, action_dim=18):
+    def __init__(self, n_states=4, n_actions=18):
         """ 初始化q网络,为全连接网络
-            state_dim: 输入的feature即环境的state数目
-            action_dim: 输出的action总个数
+            n_states: 输入的feature即环境的state数目
+            n_actions: 输出的action总个数
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(state_dim, 128) # 输入层
+        self.fc1 = nn.Linear(n_states, 128) # 输入层
         self.fc2 = nn.Linear(128, 128) # 隐藏层
-        self.fc3 = nn.Linear(128, action_dim) # 输出层
+        self.fc3 = nn.Linear(128, n_actions) # 输出层

     def forward(self, x):
         # 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
         x = F.relu(self.fc2(x))
         return self.fc3(x)
 ```
-The network takes `state_dim` inputs and produces `action_dim` outputs, with a single 128-unit hidden layer; the hidden width and depth can be increased as needed, and ReLU is the usual activation, just as in an ordinary deep-learning network.
+The network takes `n_states` inputs and produces `n_actions` outputs, with a single 128-unit hidden layer; the hidden width and depth can be increased as needed, and ReLU is the usual activation, just as in an ordinary deep-learning network.

 ### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
 Inside the agent class we create the two networks, together with the optimizer and the replay memory:

 ```python
-self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
     target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
 if random.random() > self.epsilon(self.frame_idx):
     action = self.predict(state)
 else:
-    action = random.randrange(self.action_dim)
+    action = random.randrange(self.n_actions)
 return action
 ```
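`self.epsilon(self.frame_idx)` above is a schedule rather than a constant; a small standalone sketch of the exponential decay this agent uses (the start/end/decay values below are the config defaults that appear later in this commit) shows how quickly exploration falls off:

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.90, 0.01, 500
epsilon = lambda frame_idx: epsilon_end + \
    (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)

for frame_idx in (0, 500, 2000):
    # exploration probability shrinks from 0.90 towards 0.01
    print(frame_idx, round(epsilon(frame_idx), 3))  # 0 0.9, 500 0.337, 2000 0.026
```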
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-09-15 13:35:36
+LastEditTime: 2021-12-22 14:01:37
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -21,15 +21,15 @@ import math
 import numpy as np

 class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
+    def __init__(self, n_states,n_actions,hidden_dim=128):
         """ 初始化q网络,为全连接网络
-            state_dim: 输入的特征数即环境的状态数
-            action_dim: 输出的动作维度
+            n_states: 输入的特征数即环境的状态数
+            n_actions: 输出的动作维度
         """
         super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
+        self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
         self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
+        self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层

     def forward(self, x):
         # 各层对应的激活函数
@@ -62,9 +62,9 @@ class ReplayBuffer:
         return len(self.buffer)

 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):

-        self.action_dim = action_dim  # 总的动作个数
+        self.n_actions = n_actions  # 总的动作个数
         self.device = cfg.device  # 设备,cpu或gpu等
         self.gamma = cfg.gamma  # 奖励的折扣因子
         # e-greedy策略相关参数
@@ -73,8 +73,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # 优化器
@@ -90,7 +90,7 @@ class DQN:
             q_values = self.policy_net(state)
             action = q_values.max(1)[1].item()  # 选择Q值最大的动作
         else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时,不更新策略
codes/DQN/dqn_cnn.py (new file, 133 lines)
@@ -0,0 +1,133 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.autograd as autograd
|
||||
import random
|
||||
import math
|
||||
class CNN(nn.Module):
|
||||
def __init__(self, input_dim, output_dim):
|
||||
super(CNN, self).__init__()
|
||||
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
|
||||
self.features = nn.Sequential(
|
||||
nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(32, 64, kernel_size=4, stride=2),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(64, 64, kernel_size=3, stride=1),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
self.fc = nn.Sequential(
|
||||
nn.Linear(self.feature_size(), 512),
|
||||
nn.ReLU(),
|
||||
nn.Linear(512, self.output_dim)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.features(x)
|
||||
x = x.view(x.size(0), -1)
|
||||
x = self.fc(x)
|
||||
return x
|
||||
|
||||
def feature_size(self):
|
||||
return self.features(autograd.Variable(torch.zeros(1, *self.input_dim))).view(1, -1).size(1)
|
||||
|
||||
|
||||
def act(self, state, epsilon):
|
||||
if random.random() > epsilon:
|
||||
state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
|
||||
q_value = self.forward(state)
|
||||
action = q_value.max(1)[1].data[0]
|
||||
else:
|
||||
action = random.randrange(env.action_space.n)
|
||||
return action
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
self.capacity = capacity # 经验回放的容量
|
||||
self.buffer = [] # 缓冲区
|
||||
self.position = 0
|
||||
|
||||
def push(self, state, action, reward, next_state, done):
|
||||
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
|
||||
'''
|
||||
if len(self.buffer) < self.capacity:
|
||||
self.buffer.append(None)
|
||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
||||
self.position = (self.position + 1) % self.capacity
|
||||
|
||||
def sample(self, batch_size):
|
||||
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
|
||||
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
|
||||
return state, action, reward, next_state, done
|
||||
|
||||
def __len__(self):
|
||||
''' 返回当前存储的量
|
||||
'''
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, n_states, n_actions, cfg):
|
||||
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = cfg.device # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
self.frame_idx = 0 # 用于epsilon的衰减计数
|
||||
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = CNN(n_states, n_actions).to(self.device)
|
||||
self.target_net = CNN(n_states, n_actions).to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
|
||||
|
||||
def choose_action(self, state):
|
||||
''' 选择动作
|
||||
'''
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
with torch.no_grad():
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
return
|
||||
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
# 转为张量
|
||||
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
|
||||
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
|
||||
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
|
||||
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
|
||||
# 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward
|
||||
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
|
||||
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
|
||||
# 优化更新模型
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
for param in self.policy_net.parameters(): # clip防止梯度爆炸
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step()
|
||||
|
||||
def save(self, path):
|
||||
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
|
||||
|
||||
def load(self, path):
|
||||
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
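Not part of the diff: the `CNN` above expects `input_dim` to be a channel-first image shape, and `feature_size()` runs a dummy forward pass to size the first fully connected layer. A quick shape check, assuming the usual 84x84 single-channel Atari preprocessing, would look like this (note that the `act` method inside `CNN` appears to reference `Variable`, `np` and `env`, which are not defined in this file; the agent's `choose_action` above is what is actually used):

```python
import torch

net = CNN(input_dim=(1, 84, 84), output_dim=6)   # 6 = number of discrete actions (assumed)
dummy = torch.zeros(32, 1, 84, 84)               # batch of 32 single-channel frames
print(net.feature_size())                        # 3136 = 64 channels * 7 * 7 after the convolutions
print(net(dummy).shape)                          # torch.Size([32, 6]), one Q-value per action
```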
@@ -9,11 +9,10 @@ import torch
|
||||
import datetime
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from DQN.agent import DQN
|
||||
from DQN.train import train,test
|
||||
from DQN.dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = "DQN" # 算法名称
|
||||
algo_name = 'DQN' # 算法名称
|
||||
env_name = 'CartPole-v0' # 环境名称
|
||||
|
||||
class DQNConfig:
|
||||
@@ -51,25 +50,82 @@ def env_agent_config(cfg, seed=1):
|
||||
'''
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
env.seed(seed) # 设置随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
|
||||
n_states = env.observation_space.shape[0] # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
agent = DQN(n_states, n_actions, cfg) # 创建智能体
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward, next_state, done) # 保存transition
|
||||
state = next_state # 更新下一个状态
|
||||
agent.update() # 更新智能体
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
||||
print('完成训练!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards,ma_rewards
|
||||
if __name__ == "__main__":
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
|
||||
@@ -1,3 +1,13 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-12-22 11:14:17
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-12-22 11:40:44
|
||||
Discription: 使用 Nature DQN 训练 CartPole-v1
|
||||
'''
|
||||
import sys
|
||||
import os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
@@ -9,9 +19,7 @@ import torch
|
||||
import datetime
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards, plot_rewards_cn
|
||||
from DQN.agent import DQN
|
||||
from DQN.train import train,test
|
||||
|
||||
from DQN.dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = "DQN" # 算法名称
|
||||
@@ -58,26 +66,83 @@ def env_agent_config(cfg, seed=1):
|
||||
'''
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
env.seed(seed) # 设置随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
|
||||
n_states = env.observation_space.shape[0] # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
agent = DQN(n_states, n_actions, cfg) # 创建智能体
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward, next_state, done) # 保存transition
|
||||
state = next_state # 更新下一个状态
|
||||
agent.update() # 更新智能体
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
||||
print('完成训练!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards,ma_rewards
|
||||
if __name__ == "__main__":
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
|
||||
codes/DQN/task2.py (new file, 150 lines)
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-12-22 11:14:17
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-12-22 15:27:48
|
||||
Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
|
||||
'''
|
||||
import sys
|
||||
import os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards, plot_rewards_cn
|
||||
from common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from DQN.dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = 'DQN-cnn' # 算法名称
|
||||
env_name = 'PongNoFrameskip-v4' # 环境名称
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||
class DQNConfig:
|
||||
''' 算法相关参数设置
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
self.algo_name = algo_name # 算法名称
|
||||
self.env_name = env_name # 环境名称
|
||||
self.device = device # 检测GPU
|
||||
self.train_eps = 500 # 训练的回合数
|
||||
self.test_eps = 30 # 测试的回合数
|
||||
# 超参数
|
||||
self.gamma = 0.95 # 强化学习中的折扣因子
|
||||
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
||||
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
|
||||
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
|
||||
self.lr = 0.0001 # 学习率
|
||||
self.memory_capacity = 100000 # 经验回放的容量
|
||||
self.batch_size = 64 # mini-batch SGD中的批量大小
|
||||
self.target_update = 4 # 目标网络的更新频率
|
||||
self.hidden_dim = 256 # 网络隐藏层
|
||||
class PlotConfig:
|
||||
''' 绘图相关参数设置
|
||||
'''
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.algo_name = algo_name # 算法名称
|
||||
self.env_name = env_name # 环境名称
|
||||
self.device = device # 检测GPU
|
||||
self.result_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/results/' # 保存结果的路径
|
||||
self.model_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/models/' # 保存模型的路径
|
||||
self.save = True # 是否保存图片
|
||||
|
||||
|
||||
def env_agent_config(cfg, seed=1):
|
||||
''' 创建环境和智能体
|
||||
'''
|
||||
env = make_atari(cfg.env_name) # 创建环境
|
||||
# env = wrap_deepmind(env)
|
||||
# env = wrap_pytorch(env)
|
||||
env.seed(seed) # 设置随机种子
|
||||
n_states = env.observation_space.shape[0] # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
agent = DQN(n_states, n_actions, cfg) # 创建智能体
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward, next_state, done) # 保存transition
|
||||
state = next_state # 更新下一个状态
|
||||
agent.update() # 更新智能体
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
||||
print('完成训练!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards,ma_rewards
|
||||
if __name__ == "__main__":
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
File diff suppressed because one or more lines are too long
@@ -1,138 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
@Author: John
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:48:57
|
||||
@LastEditor: John
|
||||
LastEditTime: 2021-12-22 11:08:04
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward, next_state, done) # 保存transition
|
||||
state = next_state # 更新下一个状态
|
||||
agent.update() # 更新智能体
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
||||
print('完成训练!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards,ma_rewards
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from DQN.agent import DQN
|
||||
from DQN.train import train
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
class DQNConfig:
|
||||
def __init__(self):
|
||||
self.algo = "DQN" # 算法名称
|
||||
self.env_name = 'CartPole-v0' # 环境名称
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||
self.train_eps = 200 # 训练的回合数
|
||||
self.test_eps = 30 # 测试的回合数
|
||||
# 超参数
|
||||
self.gamma = 0.95 # 强化学习中的折扣因子
|
||||
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
||||
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
|
||||
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
|
||||
self.lr = 0.0001 # 学习率
|
||||
self.memory_capacity = 100000 # 经验回放的容量
|
||||
self.batch_size = 64 # mini-batch SGD中的批量大小
|
||||
self.target_update = 4 # 目标网络的更新频率
|
||||
self.hidden_dim = 256 # 网络隐藏层
|
||||
class PlotConfig:
|
||||
def __init__(self) -> None:
|
||||
self.algo = "DQN" # 算法名称
|
||||
self.env_name = 'CartPole-v0' # 环境名称
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||
self.result_path = curr_path+"/outputs/" + self.env_name + \
|
||||
'/'+curr_time+'/results/' # 保存结果的路径
|
||||
self.model_path = curr_path+"/outputs/" + self.env_name + \
|
||||
'/'+curr_time+'/models/' # 保存模型的路径
|
||||
self.save = True # 是否保存图片
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
''' 创建环境和智能体
|
||||
'''
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
env.seed(seed) # 设置随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
agent = DQN(state_dim,action_dim,cfg) # 创建智能体
|
||||
return env,agent
|
||||
|
||||
cfg = DQNConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# 训练
|
||||
env,agent = env_agent_config(cfg,seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env,agent = env_agent_config(cfg,seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards,ma_rewards = test(cfg,env,agent)
|
||||
save_results(rewards,ma_rewards,tag='test',path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards,ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
@@ -90,15 +90,15 @@ class OUNoise(object):
|
||||
self.max_sigma = max_sigma
|
||||
self.min_sigma = min_sigma
|
||||
self.decay_period = decay_period
|
||||
self.action_dim = action_space.shape[0]
|
||||
self.n_actions = action_space.shape[0]
|
||||
self.low = action_space.low
|
||||
self.high = action_space.high
|
||||
self.reset()
|
||||
def reset(self):
|
||||
self.obs = np.ones(self.action_dim) * self.mu
|
||||
self.obs = np.ones(self.n_actions) * self.mu
|
||||
def evolve_obs(self):
|
||||
x = self.obs
|
||||
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
|
||||
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
|
||||
self.obs = x + dx
|
||||
return self.obs
|
||||
def get_action(self, action, t=0):
|
||||
|
||||
@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境,如下图,它通过向左(动作=0
 import gym
 env = gym.make('CartPole-v0') # 建立环境
 env.seed(1) # 随机种子
-state_dim = env.observation_space.shape[0] # 状态数
-action_dim = env.action_space.n # 动作数
+n_states = env.observation_space.shape[0] # 状态数
+n_actions = env.action_space.n # 动作数
 state = env.reset() # 初始化环境
-print(f"状态数:{state_dim},动作数:{action_dim}")
+print(f"状态数:{n_states},动作数:{n_actions}")
 print(f"初始状态:{state}")
 ```
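For reference, CartPole-v0 has a 4-dimensional observation (cart position/velocity, pole angle/velocity) and 2 discrete actions, so the snippet above reports 4 states and 2 actions; a stripped-down check:

```python
import gym

env = gym.make('CartPole-v0')
print(env.observation_space.shape[0])  # 4
print(env.action_space.n)              # 2
```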
@@ -157,7 +157,7 @@ def choose_action(self, state):
     q_values = self.policy_net(state)
     action = q_values.max(1)[1].item() # 选择Q值最大的动作
 else:
-    action = random.randrange(self.action_dim)
+    action = random.randrange(self.n_actions)
 ```

As you can see, this is essentially the same as in Q-learning: both use the $\epsilon$-greedy policy. The only difference is that with a neural network we rely on PyTorch (or TensorFlow) to handle the corresponding tensors.
@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
Here the program wraps the environment with a decorator; this does not change how we use or reason about the environment, and interested readers can check the wrapper code. Since gym environments are nicely packaged, using this one only requires passing its name to gym.make, after which we can inspect the numbers of states and actions:

 ```python
-state_dim = env.observation_space.n # 状态数
-action_dim = env.action_space.n # 动作数
-print(f"状态数:{state_dim},动作数:{action_dim}")
+n_states = env.observation_space.n # 状态数
+n_actions = env.action_space.n # 动作数
+print(f"状态数:{n_states},动作数:{n_actions}")
 ```

The printed result is as follows:
@@ -72,9 +72,9 @@ print(state)
 env = gym.make('CliffWalking-v0') # 定义环境
 env = CliffWalkingWapper(env) # 装饰环境
 env.seed(1) # 设置随机种子
-state_dim = env.observation_space.n # 状态数
-action_dim = env.action_space.n # 动作数
-agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数
+n_states = env.observation_space.n # 状态数
+n_actions = env.action_space.n # 动作数
+agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
     ep_reward = 0 # 记录每个回合的奖励
     state = env.reset() # 重置环境
@@ -126,7 +126,7 @@ def choose_action(self, state):
     if np.random.uniform(0, 1) > self.epsilon:
         action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
     else:
-        action = np.random.choice(self.action_dim) # 随机选择动作
+        action = np.random.choice(self.n_actions) # 随机选择动作
     return action
 ```
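The corresponding Q-learning `update` is not shown in this excerpt; a minimal sketch consistent with the `Q_table[str(state)]` indexing above (the method signature and the `self.lr` attribute are assumptions) is:

```python
def update(self, state, action, reward, next_state, done):
    # one-step temporal-difference update towards r + gamma * max_a' Q(s', a')
    predict_q = self.Q_table[str(state)][action]
    target_q = reward if done else reward + self.gamma * np.max(self.Q_table[str(next_state)])
    self.Q_table[str(state)][action] += self.lr * (target_q - predict_q)
```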
@@ -136,12 +136,12 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class DuelingNet(nn.Module):\n",
|
||||
" def __init__(self, state_dim, action_dim,hidden_size=128):\n",
|
||||
" def __init__(self, n_states, n_actions,hidden_size=128):\n",
|
||||
" super(DuelingNet, self).__init__()\n",
|
||||
" \n",
|
||||
" # 隐藏层\n",
|
||||
" self.hidden = nn.Sequential(\n",
|
||||
" nn.Linear(state_dim, hidden_size),\n",
|
||||
" nn.Linear(n_states, hidden_size),\n",
|
||||
" nn.ReLU()\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
@@ -149,7 +149,7 @@
|
||||
" self.advantage = nn.Sequential(\n",
|
||||
" nn.Linear(hidden_size, hidden_size),\n",
|
||||
" nn.ReLU(),\n",
|
||||
" nn.Linear(hidden_size, action_dim)\n",
|
||||
" nn.Linear(hidden_size, n_actions)\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # 价值函数\n",
|
||||
@@ -192,7 +192,7 @@
|
||||
],
|
||||
"source": [
|
||||
"class DuelingDQN:\n",
|
||||
" def __init__(self,state_dim,action_dim,cfg) -> None:\n",
|
||||
" def __init__(self,n_states,n_actions,cfg) -> None:\n",
|
||||
" self.batch_size = cfg.batch_size\n",
|
||||
" self.device = cfg.device\n",
|
||||
" self.loss_history = [] # 记录loss的变化\n",
|
||||
@@ -200,8 +200,8 @@
|
||||
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
|
||||
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
|
||||
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
|
||||
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
|
||||
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
|
||||
" self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
|
||||
" self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
|
||||
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
|
||||
" target_param.data.copy_(param.data)\n",
|
||||
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
|
||||
@@ -214,7 +214,7 @@
|
||||
" q_values = self.policy_net(state)\n",
|
||||
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
|
||||
" else:\n",
|
||||
" action = random.randrange(self.action_dim)\n",
|
||||
" action = random.randrange(self.n_actions)\n",
|
||||
" return action\n",
|
||||
" def update(self):\n",
|
||||
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略\n",
|
||||
|
||||
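The notebook cells above set up the hidden, advantage and value streams of the dueling network, but the step that combines them falls outside the excerpt; a self-contained sketch of the standard dueling aggregation (class and argument names here are illustrative) is:

```python
import torch
import torch.nn as nn

class DuelingHead(nn.Module):
    """Minimal sketch of the dueling aggregation."""
    def __init__(self, n_states, n_actions, hidden_size=128):
        super().__init__()
        self.hidden = nn.Sequential(nn.Linear(n_states, hidden_size), nn.ReLU())
        self.advantage = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(),
                                       nn.Linear(hidden_size, n_actions))
        self.value = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(),
                                   nn.Linear(hidden_size, 1))

    def forward(self, state):
        x = self.hidden(state)
        advantage = self.advantage(x)
        value = self.value(x)
        # Q(s,a) = V(s) + A(s,a) - mean_a A(s,a); subtracting the mean keeps V and A identifiable
        return value + advantage - advantage.mean(dim=-1, keepdim=True)
```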
codes/Logs.md (new file, 5 lines)
@@ -0,0 +1,5 @@
## Update log

**2021.12.22-3**: renamed ```agent.py``` to the corresponding algorithm name, to distinguish cases such as ```dqn``` vs ```dqn_cnn```
**2021.12.22-2**: simplified the code structure by merging the original ```train.py```, ```task.py``` and similar files into ```task.py```
**2021.12.22-1**: simplified the code structure by merging the original ```model.py```, ```memory.py``` and similar files into ```agent.py```, and moving the contents of ```plot.py``` into ```common.utils.py```
@@ -17,11 +17,11 @@ import dill
|
||||
class FisrtVisitMC:
|
||||
''' On-Policy First-Visit MC Control
|
||||
'''
|
||||
def __init__(self,action_dim,cfg):
|
||||
self.action_dim = action_dim
|
||||
def __init__(self,n_actions,cfg):
|
||||
self.n_actions = n_actions
|
||||
self.epsilon = cfg.epsilon
|
||||
self.gamma = cfg.gamma
|
||||
self.Q_table = defaultdict(lambda: np.zeros(action_dim))
|
||||
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
|
||||
self.returns_sum = defaultdict(float) # sum of returns
|
||||
self.returns_count = defaultdict(float)
|
||||
|
||||
@@ -29,11 +29,11 @@ class FisrtVisitMC:
|
||||
''' e-greed policy '''
|
||||
if state in self.Q_table.keys():
|
||||
best_action = np.argmax(self.Q_table[state])
|
||||
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
|
||||
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
|
||||
action_probs[best_action] += (1.0 - self.epsilon)
|
||||
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
|
||||
else:
|
||||
action = np.random.randint(0,self.action_dim)
|
||||
action = np.random.randint(0,self.n_actions)
|
||||
return action
|
||||
def update(self,one_ep_transition):
|
||||
# Find all (state, action) pairs we've visited in this one_ep_transition
|
||||
|
||||
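The rest of the first-visit MC `update` is cut off after the comment above; a sketch of how it usually continues (assuming `one_ep_transition` is a list of `(state, action, reward)` tuples with hashable states) is:

```python
def update(self, one_ep_transition):
    # Average the discounted return following the first visit of each (state, action) pair
    sa_in_episode = set((x[0], x[1]) for x in one_ep_transition)
    for state, action in sa_in_episode:
        first_idx = next(i for i, x in enumerate(one_ep_transition)
                         if x[0] == state and x[1] == action)
        G = sum(x[2] * (self.gamma ** i)
                for i, x in enumerate(one_ep_transition[first_idx:]))
        self.returns_sum[(state, action)] += G
        self.returns_count[(state, action)] += 1.0
        self.Q_table[state][action] = self.returns_sum[(state, action)] / self.returns_count[(state, action)]
```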
@@ -43,8 +43,8 @@ class MCConfig:
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = RacetrackEnv()
|
||||
action_dim = 9
|
||||
agent = FisrtVisitMC(action_dim, cfg)
|
||||
n_actions = 9
|
||||
agent = FisrtVisitMC(n_actions, cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
|
||||
codes/NoisyDQN/noisy_dqn.py (new file, 52 lines)
@@ -0,0 +1,52 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
class NoisyLinear(nn.Module):
|
||||
def __init__(self, input_dim, output_dim, std_init=0.4):
|
||||
super(NoisyLinear, self).__init__()
|
||||
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
self.std_init = std_init
|
||||
|
||||
self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
|
||||
self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
|
||||
self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim))
|
||||
|
||||
self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim))
|
||||
self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim))
|
||||
self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim))
|
||||
|
||||
self.reset_parameters()
|
||||
self.reset_noise()
|
||||
|
||||
def forward(self, x):
|
||||
if self.training:
|
||||
weight = self.weight_mu + self.weight_sigma.mul( (self.weight_epsilon))
|
||||
bias = self.bias_mu + self.bias_sigma.mul(Variable(self.bias_epsilon))
|
||||
else:
|
||||
weight = self.weight_mu
|
||||
bias = self.bias_mu
|
||||
|
||||
return F.linear(x, weight, bias)
|
||||
|
||||
def reset_parameters(self):
|
||||
mu_range = 1 / math.sqrt(self.weight_mu.size(1))
|
||||
|
||||
self.weight_mu.data.uniform_(-mu_range, mu_range)
|
||||
self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
|
||||
|
||||
self.bias_mu.data.uniform_(-mu_range, mu_range)
|
||||
self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
|
||||
|
||||
def reset_noise(self):
|
||||
epsilon_in = self._scale_noise(self.input_dim)
|
||||
epsilon_out = self._scale_noise(self.output_dim)
|
||||
|
||||
self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
|
||||
self.bias_epsilon.copy_(self._scale_noise(self.output_dim))
|
||||
|
||||
def _scale_noise(self, size):
|
||||
x = torch.randn(size)
|
||||
x = x.sign().mul(x.abs().sqrt())
|
||||
return x
|
||||
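As committed, `noisy_dqn.py` uses `math`, `F.linear` and `Variable` without importing them; the header it presumably needs (a guess, not part of the diff) is below, and on current PyTorch the `Variable(...)` wrapper in `forward` can simply be dropped in favour of the raw `bias_epsilon` buffer:

```python
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable  # only needed while forward() still wraps bias_epsilon in Variable
```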
@@ -57,16 +57,16 @@ model就是actor和critic两个网络了:
|
||||
import torch.nn as nn
|
||||
from torch.distributions.categorical import Categorical
|
||||
class Actor(nn.Module):
|
||||
def __init__(self,state_dim, action_dim,
|
||||
def __init__(self,n_states, n_actions,
|
||||
hidden_dim=256):
|
||||
super(Actor, self).__init__()
|
||||
|
||||
self.actor = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, action_dim),
|
||||
nn.Linear(hidden_dim, n_actions),
|
||||
nn.Softmax(dim=-1)
|
||||
)
|
||||
def forward(self, state):
|
||||
@@ -75,10 +75,10 @@ class Actor(nn.Module):
|
||||
return dist
|
||||
|
||||
class Critic(nn.Module):
|
||||
def __init__(self, state_dim,hidden_dim=256):
|
||||
def __init__(self, n_states,hidden_dim=256):
|
||||
super(Critic, self).__init__()
|
||||
self.critic = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
@@ -88,7 +88,7 @@ class Critic(nn.Module):
         value = self.critic(state)
         return value
 ```
-The Actor outputs a probability distribution (Categorical here, but other distributions from torch.distributions can be used), and the Critic maps the current state to a single value. The Critic's input size could also be ```state_dim+action_dim```, i.e. the action information is fed into the Critic as well, which tends to work a little better; interested readers can try it.
+The Actor outputs a probability distribution (Categorical here, but other distributions from torch.distributions can be used), and the Critic maps the current state to a single value. The Critic's input size could also be ```n_states+n_actions```, i.e. the action information is fed into the Critic as well, which tends to work a little better; interested readers can try it.

 ### PPO update
 An ```update``` function implements steps six and seven of the pseudocode:
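The `update` body itself is not included in this excerpt; the core of those steps, the clipped surrogate objective, can be sketched as follows (function and argument names are illustrative):

```python
import torch

def ppo_clip_loss(new_logprobs, old_logprobs, advantages, policy_clip=0.2):
    # probability ratio r_t(theta) = pi_theta(a|s) / pi_theta_old(a|s), computed in log space
    ratio = (new_logprobs - old_logprobs).exp()
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantages
    # PPO maximizes the minimum of the two surrogates; as a loss we take the negative mean
    return -torch.min(unclipped, clipped).mean()
```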
@@ -16,15 +16,15 @@ import torch.optim as optim
|
||||
from PPO.model import Actor,Critic
|
||||
from PPO.memory import PPOMemory
|
||||
class PPO:
|
||||
def __init__(self, state_dim, action_dim,cfg):
|
||||
def __init__(self, n_states, n_actions,cfg):
|
||||
self.gamma = cfg.gamma
|
||||
self.continuous = cfg.continuous
|
||||
self.policy_clip = cfg.policy_clip
|
||||
self.n_epochs = cfg.n_epochs
|
||||
self.gae_lambda = cfg.gae_lambda
|
||||
self.device = cfg.device
|
||||
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
|
||||
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
|
||||
self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
|
||||
self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
|
||||
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
|
||||
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
|
||||
self.memory = PPOMemory(cfg.batch_size)
|
||||
|
||||
@@ -12,16 +12,16 @@ Environment:
|
||||
import torch.nn as nn
|
||||
from torch.distributions.categorical import Categorical
|
||||
class Actor(nn.Module):
|
||||
def __init__(self,state_dim, action_dim,
|
||||
def __init__(self,n_states, n_actions,
|
||||
hidden_dim):
|
||||
super(Actor, self).__init__()
|
||||
|
||||
self.actor = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, action_dim),
|
||||
nn.Linear(hidden_dim, n_actions),
|
||||
nn.Softmax(dim=-1)
|
||||
)
|
||||
def forward(self, state):
|
||||
@@ -30,10 +30,10 @@ class Actor(nn.Module):
|
||||
return dist
|
||||
|
||||
class Critic(nn.Module):
|
||||
def __init__(self, state_dim,hidden_dim):
|
||||
def __init__(self, n_states,hidden_dim):
|
||||
super(Critic, self).__init__()
|
||||
self.critic = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
|
||||
@@ -45,9 +45,9 @@ class PlotConfig:
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name)
|
||||
env.seed(seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
agent = PPO(state_dim,action_dim,cfg)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n
|
||||
agent = PPO(n_states,n_actions,cfg)
|
||||
return env,agent
|
||||
|
||||
cfg = PPOConfig()
|
||||
|
||||
@@ -45,9 +45,9 @@ class PlotConfig:
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name)
|
||||
env.seed(seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
agent = PPO(state_dim,action_dim,cfg)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.shape[0]
|
||||
agent = PPO(n_states,n_actions,cfg)
|
||||
return env,agent
|
||||
|
||||
|
||||
|
||||
@@ -90,9 +90,9 @@
|
||||
"def env_agent_config(cfg,seed=1):\n",
|
||||
" env = gym.make(cfg.env) \n",
|
||||
" env.seed(seed)\n",
|
||||
" state_dim = env.observation_space.shape[0]\n",
|
||||
" action_dim = env.action_space.n\n",
|
||||
" agent = PPO(state_dim,action_dim,cfg)\n",
|
||||
" n_states = env.observation_space.shape[0]\n",
|
||||
" n_actions = env.action_space.n\n",
|
||||
" agent = PPO(n_states,n_actions,cfg)\n",
|
||||
" return env,agent"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -99,9 +99,9 @@ if __name__ == '__main__':
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name)
|
||||
env.seed(seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.n
|
||||
agent = PPO(state_dim,action_dim,cfg)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n
|
||||
agent = PPO(n_states,n_actions,cfg)
|
||||
return env,agent
|
||||
|
||||
cfg = PPOConfig()
|
||||
|
||||
@@ -17,9 +17,9 @@ from PolicyGradient.model import MLP
|
||||
|
||||
class PolicyGradient:
|
||||
|
||||
def __init__(self, state_dim,cfg):
|
||||
def __init__(self, n_states,cfg):
|
||||
self.gamma = cfg.gamma
|
||||
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
|
||||
self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
|
||||
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
|
||||
self.batch_size = cfg.batch_size
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ class MLP(nn.Module):
|
||||
'''
|
||||
def __init__(self,input_dim,hidden_dim = 36):
|
||||
super(MLP, self).__init__()
|
||||
# 24和36为hidden layer的层数,可根据input_dim, action_dim的情况来改变
|
||||
# 24和36为hidden layer的层数,可根据input_dim, n_actions的情况来改变
|
||||
self.fc1 = nn.Linear(input_dim, hidden_dim)
|
||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
|
||||
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
|
||||
|
||||
@@ -46,8 +46,8 @@ class PGConfig:
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env)
|
||||
env.seed(seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
agent = PolicyGradient(state_dim,cfg)
|
||||
n_states = env.observation_space.shape[0]
|
||||
agent = PolicyGradient(n_states,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
|
||||
@@ -16,7 +16,7 @@
 **Note: in the new version, the ```model``` and ```memory``` code has all been moved into ```agent.py```, and ```plot``` into ```common.utils```.**
 ## Runtime environment

-python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
+python 3.7、pytorch 1.6.0-1.8.1、gym 0.21.0

 ## Usage

@@ -36,7 +36,7 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
 | [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
 | [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
 | [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
-| [SAC](./SAC) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
+| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
 | [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
 | [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
 | [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2](./envs/mujoco_info.md) | |
@@ -1,110 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-04-29 12:53:54
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-04-29 13:56:39
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import copy
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import numpy as np
|
||||
from common.memory import ReplayBuffer
|
||||
from SAC.model import ValueNet,PolicyNet,SoftQNet
|
||||
|
||||
class SAC:
|
||||
def __init__(self,state_dim,action_dim,cfg) -> None:
|
||||
self.batch_size = cfg.batch_size
|
||||
self.memory = ReplayBuffer(cfg.capacity)
|
||||
self.device = cfg.device
|
||||
self.value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
|
||||
self.target_value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
|
||||
self.soft_q_net = SoftQNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
|
||||
self.policy_net = PolicyNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
|
||||
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
|
||||
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
|
||||
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
|
||||
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
|
||||
target_param.data.copy_(param.data)
|
||||
self.value_criterion = nn.MSELoss()
|
||||
self.soft_q_criterion = nn.MSELoss()
|
||||
def update(self, gamma=0.99,mean_lambda=1e-3,
|
||||
std_lambda=1e-3,
|
||||
z_lambda=0.0,
|
||||
soft_tau=1e-2,
|
||||
):
|
||||
if len(self.memory) < self.batch_size:
|
||||
return
|
||||
state, action, reward, next_state, done = self.memory.sample(self.batch_size)
|
||||
state = torch.FloatTensor(state).to(self.device)
|
||||
next_state = torch.FloatTensor(next_state).to(self.device)
|
||||
action = torch.FloatTensor(action).to(self.device)
|
||||
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
|
||||
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
|
||||
expected_q_value = self.soft_q_net(state, action)
|
||||
expected_value = self.value_net(state)
|
||||
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
|
||||
|
||||
|
||||
target_value = self.target_value_net(next_state)
|
||||
next_q_value = reward + (1 - done) * gamma * target_value
|
||||
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
|
||||
|
||||
expected_new_q_value = self.soft_q_net(state, new_action)
|
||||
next_value = expected_new_q_value - log_prob
|
||||
value_loss = self.value_criterion(expected_value, next_value.detach())
|
||||
|
||||
log_prob_target = expected_new_q_value - expected_value
|
||||
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
|
||||
|
||||
|
||||
mean_loss = mean_lambda * mean.pow(2).mean()
|
||||
std_loss = std_lambda * log_std.pow(2).mean()
|
||||
z_loss = z_lambda * z.pow(2).sum(1).mean()
|
||||
|
||||
policy_loss += mean_loss + std_loss + z_loss
|
||||
|
||||
self.soft_q_optimizer.zero_grad()
|
||||
q_value_loss.backward()
|
||||
self.soft_q_optimizer.step()
|
||||
|
||||
self.value_optimizer.zero_grad()
|
||||
value_loss.backward()
|
||||
self.value_optimizer.step()
|
||||
|
||||
self.policy_optimizer.zero_grad()
|
||||
policy_loss.backward()
|
||||
self.policy_optimizer.step()
|
||||
|
||||
|
||||
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
|
||||
target_param.data.copy_(
|
||||
target_param.data * (1.0 - soft_tau) + param.data * soft_tau
|
||||
)
|
||||
def save(self, path):
|
||||
torch.save(self.value_net.state_dict(), path + "sac_value")
|
||||
torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
|
||||
|
||||
torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
|
||||
torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")
|
||||
|
||||
torch.save(self.policy_net.state_dict(), path + "sac_policy")
|
||||
torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")
|
||||
|
||||
|
||||
|
||||
def load(self, path):
|
||||
self.value_net.load_state_dict(torch.load(path + "sac_value"))
|
||||
self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
|
||||
self.target_value_net = copy.deepcopy(self.value_net)
|
||||
|
||||
self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
|
||||
self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))
|
||||
|
||||
self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
|
||||
self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))
|
||||
Binary files changed; contents not shown (includes two images, 59 KiB and 58 KiB).
@@ -14,17 +14,17 @@ from collections import defaultdict
import torch
class Sarsa(object):
    def __init__(self,
                 action_dim,sarsa_cfg,):
        self.action_dim = action_dim  # number of actions
                 n_actions,sarsa_cfg,):
        self.n_actions = n_actions  # number of actions
        self.lr = sarsa_cfg.lr  # learning rate
        self.gamma = sarsa_cfg.gamma
        self.epsilon = sarsa_cfg.epsilon
        self.Q = defaultdict(lambda: np.zeros(action_dim))
        # self.Q = np.zeros((state_dim, action_dim))  # Q table
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        # self.Q = np.zeros((n_states, n_actions))  # Q table
    def choose_action(self, state):
        best_action = np.argmax(self.Q[state])
        # action = best_action
        action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
        action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
        action_probs[best_action] += (1.0 - self.epsilon)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        return action

@@ -39,8 +39,8 @@ class SarsaConfig:

def env_agent_config(cfg,seed=1):
    env = RacetrackEnv()
    action_dim=9
    agent = Sarsa(action_dim,cfg)
    n_actions=9
    agent = Sarsa(n_actions,cfg)
    return env,agent

def train(cfg,env,agent):
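For reference, the ε-greedy probabilities built in `Sarsa.choose_action` above give every action a floor of ε/n_actions and the currently greedy action an extra 1-ε. A small worked sketch with illustrative numbers:

```python
import numpy as np

n_actions, epsilon = 9, 0.1
Q_s = np.zeros(n_actions)
Q_s[3] = 1.0                                    # pretend action 3 currently looks best
best_action = np.argmax(Q_s)
action_probs = np.ones(n_actions) * epsilon / n_actions
action_probs[best_action] += 1.0 - epsilon      # ~0.011 for each action, ~0.911 for action 3
action = np.random.choice(np.arange(n_actions), p=action_probs)
```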
@@ -5,12 +5,13 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:52:11
LastEditor: JiangJi
LastEditTime: 2021-04-29 12:52:31
LastEditTime: 2021-12-22 15:36:36
Discription: 
Environment: 
'''
import gym
import numpy as np

class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        low = self.action_space.low
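The hunk above cuts off right after `low = self.action_space.low`. The usual body of such a wrapper (an assumption here, not taken verbatim from this commit) rescales a policy output in [-1, 1] onto the environment's [low, high] box:

```python
import gym
import numpy as np

class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # assumed rescaling: [-1, 1] -> [low, high], then clip for safety
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)
```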
@@ -17,10 +17,10 @@ from torch.distributions import Normal
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ValueNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, init_w=3e-3):
    def __init__(self, n_states, hidden_dim, init_w=3e-3):
        super(ValueNet, self).__init__()

        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

@@ -35,10 +35,10 @@ class ValueNet(nn.Module):


class SoftQNet(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
        super(SoftQNet, self).__init__()

        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

@@ -54,20 +54,20 @@ class SoftQNet(nn.Module):


class PolicyNet(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
        super(PolicyNet, self).__init__()

        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, action_dim)
        self.mean_linear = nn.Linear(hidden_dim, n_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)

        self.log_std_linear = nn.Linear(hidden_dim, action_dim)
        self.log_std_linear = nn.Linear(hidden_dim, n_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 44 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
222 codes/SoftActorCritic/sac.py Normal file
@@ -0,0 +1,222 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:53:54
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:41:19
Discription: 
Environment: 
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity  # capacity of the replay buffer
        self.buffer = []  # buffer
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        ''' The buffer works as a queue: once capacity is exceeded, the earliest stored transition is overwritten
        '''
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  # randomly sample a mini-batch of transitions
        state, action, reward, next_state, done = zip(*batch)  # unzip into states, actions, etc.
        return state, action, reward, next_state, done

    def __len__(self):
        ''' Return the number of transitions currently stored
        '''
        return len(self.buffer)

class ValueNet(nn.Module):
    def __init__(self, n_states, hidden_dim, init_w=3e-3):
        super(ValueNet, self).__init__()

        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x


class SoftQNet(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
        super(SoftQNet, self).__init__()

        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x


class PolicyNet(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
        super(PolicyNet, self).__init__()

        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

        self.mean_linear = nn.Linear(hidden_dim, n_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)

        self.log_std_linear = nn.Linear(hidden_dim, n_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))

        mean = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)

        return mean, log_std

    def evaluate(self, state, epsilon=1e-6):
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = log_prob.sum(-1, keepdim=True)

        return action, log_prob, z, mean, log_std

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        mean, log_std = self.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)

        action = action.detach().cpu().numpy()
        return action[0]

class SAC:
    def __init__(self, n_states, n_actions, cfg) -> None:
        self.batch_size = cfg.batch_size
        self.memory = ReplayBuffer(cfg.capacity)
        self.device = cfg.device
        self.value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
        self.target_value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
        self.soft_q_net = SoftQNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
        self.policy_net = PolicyNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
        self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(param.data)
        self.value_criterion = nn.MSELoss()
        self.soft_q_criterion = nn.MSELoss()

    def update(self, gamma=0.99, mean_lambda=1e-3,
               std_lambda=1e-3,
               z_lambda=0.0,
               soft_tau=1e-2,
               ):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
        expected_q_value = self.soft_q_net(state, action)
        expected_value = self.value_net(state)
        new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)

        target_value = self.target_value_net(next_state)
        next_q_value = reward + (1 - done) * gamma * target_value
        q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())

        expected_new_q_value = self.soft_q_net(state, new_action)
        next_value = expected_new_q_value - log_prob
        value_loss = self.value_criterion(expected_value, next_value.detach())

        log_prob_target = expected_new_q_value - expected_value
        policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()

        mean_loss = mean_lambda * mean.pow(2).mean()
        std_loss = std_lambda * log_std.pow(2).mean()
        z_loss = z_lambda * z.pow(2).sum(1).mean()

        policy_loss += mean_loss + std_loss + z_loss

        self.soft_q_optimizer.zero_grad()
        q_value_loss.backward()
        self.soft_q_optimizer.step()

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - soft_tau) + param.data * soft_tau
            )

    def save(self, path):
        torch.save(self.value_net.state_dict(), path + "sac_value")
        torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
        torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
        torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")

        torch.save(self.policy_net.state_dict(), path + "sac_policy")
        torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")

    def load(self, path):
        self.value_net.load_state_dict(torch.load(path + "sac_value"))
        self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
        self.target_value_net = copy.deepcopy(self.value_net)

        self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
        self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))

        self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
        self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))
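Putting the new `sac.py` together, one interaction-and-update step looks roughly like the sketch below. The config fields mirror the attributes read in `SAC.__init__` and the capacity/hidden_dim/batch_size values used elsewhere in this commit; the learning rates and the episode loop are assumptions for illustration only:

```python
import gym
import torch
from SoftActorCritic.sac import SAC
from SoftActorCritic.env_wrapper import NormalizedActions

class SACConfig:                                  # minimal stand-in for the repo's config class
    capacity = 1000000
    hidden_dim = 256
    batch_size = 128
    value_lr = soft_q_lr = policy_lr = 3e-4       # assumed learning rates (not shown in this hunk)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedActions(gym.make("Pendulum-v1"))
agent = SAC(env.observation_space.shape[0], env.action_space.shape[0], SACConfig())
state = env.reset()
for _ in range(200):
    action = agent.policy_net.get_action(state)
    next_state, reward, done, _ = env.step(action)
    agent.memory.push(state, action, reward, next_state, done)
    agent.update()                                # silently skips until the buffer holds one batch
    state = env.reset() if done else next_state
```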
@@ -5,7 +5,7 @@ Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-04-29 12:59:22
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-05-06 16:58:01
|
||||
LastEditTime: 2021-12-22 16:27:13
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
@@ -18,23 +18,24 @@ import gym
|
||||
import torch
|
||||
import datetime
|
||||
|
||||
from SAC.env import NormalizedActions
|
||||
from SAC.agent import SAC
|
||||
from SoftActorCritic.env_wrapper import NormalizedActions
|
||||
from SoftActorCritic.sac import SAC
|
||||
from common.utils import save_results, make_dir
|
||||
from common.plot import plot_rewards
|
||||
from common.utils import plot_rewards
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = 'SAC' # 算法名称
|
||||
env_name = 'Pendulum-v1' # 环境名称
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||
|
||||
class SACConfig:
|
||||
def __init__(self) -> None:
|
||||
self.algo = 'SAC'
|
||||
self.env_name = 'Pendulum-v1'
|
||||
self.result_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/results/' # path to save results
|
||||
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
|
||||
self.algo_name = algo_name
|
||||
self.env_name = env_name # 环境名称
|
||||
self.device= device
|
||||
self.train_eps = 300
|
||||
self.train_steps = 500
|
||||
self.test_eps = 50
|
||||
self.eval_steps = 500
|
||||
self.test_eps = 20
|
||||
self.max_steps = 500 # 每回合的最大步数
|
||||
self.gamma = 0.99
|
||||
self.mean_lambda=1e-3
|
||||
self.std_lambda=1e-3
|
||||
@@ -46,33 +47,36 @@ class SACConfig:
|
||||
self.capacity = 1000000
|
||||
self.hidden_dim = 256
|
||||
self.batch_size = 128
|
||||
self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
class PlotConfig(SACConfig):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.result_path = curr_path+"/outputs/" + self.env_name + \
|
||||
'/'+curr_time+'/results/' # 保存结果的路径
|
||||
self.model_path = curr_path+"/outputs/" + self.env_name + \
|
||||
'/'+curr_time+'/models/' # 保存模型的路径
|
||||
self.save = True # 是否保存图片
|
||||
|
||||
|
||||
class PlotConfig:
|
||||
def __init__(self) -> None:
|
||||
self.algo_name = algo_name # 算法名称
|
||||
self.env_name = env_name # 环境名称
|
||||
self.device= device
|
||||
self.result_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/results/' # 保存结果的路径
|
||||
self.model_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/models/' # 保存模型的路径
|
||||
self.save = True # 是否保存图片
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = NormalizedActions(gym.make(cfg.env_name))
|
||||
env.seed(seed)
|
||||
action_dim = env.action_space.shape[0]
|
||||
state_dim = env.observation_space.shape[0]
|
||||
agent = SAC(state_dim,action_dim,cfg)
|
||||
n_actions = env.action_space.shape[0]
|
||||
n_states = env.observation_space.shape[0]
|
||||
agent = SAC(n_states,n_actions,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
for i_step in range(cfg.train_steps):
|
||||
for i_step in range(cfg.max_steps):
|
||||
action = agent.policy_net.get_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
agent.memory.push(state, action, reward, next_state, done)
|
||||
@@ -81,57 +85,57 @@ def train(cfg,env,agent):
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1)%10==0:
|
||||
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print('Complete training!')
|
||||
if (i_ep+1)%10 == 0:
|
||||
print(f'回合:{i_ep+1}/{cfg.train_eps}, 奖励:{ep_reward:.3f}')
|
||||
print('完成训练!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
def eval(cfg,env,agent):
|
||||
print('Start to eval !')
|
||||
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
|
||||
rewards = []
|
||||
ma_rewards = [] # moveing average reward
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
for i_step in range(cfg.eval_steps):
|
||||
for i_step in range(cfg.max_steps):
|
||||
action = agent.policy_net.get_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1)%10==0:
|
||||
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print('Complete evaling!')
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg=SACConfig()
|
||||
plot_cfg = PlotConfig()
|
||||
# train
|
||||
env,agent = env_agent_config(cfg,seed=1)
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path)
|
||||
agent.save(path=plot_cfg.model_path)
|
||||
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
|
||||
# eval
|
||||
env,agent = env_agent_config(cfg,seed=10)
|
||||
agent.load(path=plot_cfg.model_path)
|
||||
rewards,ma_rewards = eval(cfg,env,agent)
|
||||
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
|
||||
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
|
||||
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=plot_cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
agent.load(path=plot_cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
|
||||
|
||||
|
||||
|
||||
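The `ma_rewards` bookkeeping in the train/test loops above is an exponential moving average, `ma_t = 0.9 * ma_(t-1) + 0.1 * r_t`, seeded with the first episode reward. A tiny standalone sketch of the same smoothing:

```python
def smooth(rewards, alpha=0.1):
    ma_rewards = []
    for r in rewards:
        ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * r if ma_rewards else r)
    return ma_rewards

print(smooth([0.0, 10.0, 10.0]))  # [0.0, 1.0, 1.9]
```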
@@ -70,9 +70,9 @@
|
||||
"def env_agent_config(cfg,seed=1):\n",
|
||||
" env = NormalizedActions(gym.make(\"Pendulum-v0\"))\n",
|
||||
" env.seed(seed)\n",
|
||||
" action_dim = env.action_space.shape[0]\n",
|
||||
" state_dim = env.observation_space.shape[0]\n",
|
||||
" agent = SAC(state_dim,action_dim,cfg)\n",
|
||||
" n_actions = env.action_space.shape[0]\n",
|
||||
" n_states = env.observation_space.shape[0]\n",
|
||||
" agent = SAC(n_states,n_actions,cfg)\n",
|
||||
" return env,agent"
|
||||
]
|
||||
},
|
||||
@@ -159,7 +159,7 @@
|
||||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||
"\u001b[0;31mDeprecatedEnv\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-91b1038013e4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0magent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrewards\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma_rewards\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mmake_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0maction_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstate_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_actions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mn_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(id, **kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mregistry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(self, path, **kwargs)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Making new env: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mspec\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 185\u001b[0m raise error.DeprecatedEnv(\n\u001b[1;32m 186\u001b[0m \"Env {} not found (valid versions include {})\".format(\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmatching_envs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m )\n\u001b[1;32m 189\u001b[0m )\n",
|
||||
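The notebook traceback above is the `DeprecatedEnv` error raised because the notebook cell still calls `gym.make("Pendulum-v0")` against a gym version that only registers `Pendulum-v1`. The scripts in this commit already switched the environment name; the notebook needs the same one-line change (sketch):

```python
env = NormalizedActions(gym.make("Pendulum-v1"))  # Pendulum-v0 is no longer registered in newer gym releases
```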
@@ -14,13 +14,13 @@ import torch
|
||||
|
||||
|
||||
class ReplayBuffer(object):
|
||||
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
|
||||
def __init__(self, n_states, n_actions, max_size=int(1e6)):
|
||||
self.max_size = max_size
|
||||
self.ptr = 0
|
||||
self.size = 0
|
||||
self.state = np.zeros((max_size, state_dim))
|
||||
self.action = np.zeros((max_size, action_dim))
|
||||
self.next_state = np.zeros((max_size, state_dim))
|
||||
self.state = np.zeros((max_size, n_states))
|
||||
self.action = np.zeros((max_size, n_actions))
|
||||
self.next_state = np.zeros((max_size, n_states))
|
||||
self.reward = np.zeros((max_size, 1))
|
||||
self.not_done = np.zeros((max_size, 1))
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
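This buffer (apparently the TD3-style one) pre-allocates numpy arrays and overwrites the oldest entries once full. Its `add`/`sample` methods are not part of the hunk shown here, so the sketch below is only a plausible, simplified completion of that ring-buffer pattern (class and method names are illustrative):

```python
import numpy as np
import torch

class RingBuffer:
    def __init__(self, n_states, n_actions, max_size=int(1e6)):
        self.max_size, self.ptr, self.size = max_size, 0, 0
        self.state = np.zeros((max_size, n_states))
        self.action = np.zeros((max_size, n_actions))
        self.reward = np.zeros((max_size, 1))

    def add(self, state, action, reward):
        self.state[self.ptr], self.action[self.ptr], self.reward[self.ptr] = state, action, reward
        self.ptr = (self.ptr + 1) % self.max_size       # wrap around when full
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return (torch.FloatTensor(self.state[idx]),
                torch.FloatTensor(self.action[idx]),
                torch.FloatTensor(self.reward[idx]))
```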
@@ -74,10 +74,10 @@ if __name__ == "__main__":
|
||||
env.seed(cfg.seed) # Set seeds
|
||||
torch.manual_seed(cfg.seed)
|
||||
np.random.seed(cfg.seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.shape[0]
|
||||
max_action = float(env.action_space.high[0])
|
||||
td3= TD3(state_dim,action_dim,max_action,cfg)
|
||||
td3= TD3(n_states,n_actions,max_action,cfg)
|
||||
cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
|
||||
td3.load(cfg.model_path)
|
||||
td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed)
|
||||
|
||||
@@ -72,7 +72,7 @@ def train(cfg,env,agent):
|
||||
else:
|
||||
action = (
|
||||
agent.choose_action(np.array(state))
|
||||
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
|
||||
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
|
||||
).clip(-max_action, max_action)
|
||||
# Perform action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
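The exploration step above adds zero-mean Gaussian noise, scaled by `max_action * cfg.expl_noise`, to the deterministic action and clips the result back into the valid range. Stand-alone, with illustrative numbers:

```python
import numpy as np

max_action, expl_noise, n_actions = 2.0, 0.1, 1
deterministic_action = np.array([1.9])
noisy_action = (deterministic_action
                + np.random.normal(0, max_action * expl_noise, size=n_actions)
                ).clip(-max_action, max_action)   # stays inside [-2, 2]
```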
@@ -121,11 +121,11 @@ def train(cfg,env,agent):
|
||||
# else:
|
||||
# action = (
|
||||
# agent.choose_action(np.array(state))
|
||||
# + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
|
||||
# + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
|
||||
# ).clip(-max_action, max_action)
|
||||
# # action = (
|
||||
# # agent.choose_action(np.array(state))
|
||||
# # + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
|
||||
# # + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
|
||||
# # ).clip(-max_action, max_action)
|
||||
# # Perform action
|
||||
# next_state, reward, done, _ = env.step(action)
|
||||
@@ -157,10 +157,10 @@ if __name__ == "__main__":
|
||||
env.seed(cfg.seed) # Set seeds
|
||||
torch.manual_seed(cfg.seed)
|
||||
np.random.seed(cfg.seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.shape[0]
|
||||
max_action = float(env.action_space.high[0])
|
||||
agent = TD3(state_dim,action_dim,max_action,cfg)
|
||||
agent = TD3(n_states,n_actions,max_action,cfg)
|
||||
rewards,ma_rewards = train(cfg,env,agent)
|
||||
make_dir(cfg.result_path,cfg.model_path)
|
||||
agent.save(path=cfg.model_path)
|
||||
|
||||
@@ -70,10 +70,10 @@ if __name__ == "__main__":
|
||||
env.seed(cfg.seed) # Set seeds
|
||||
torch.manual_seed(cfg.seed)
|
||||
np.random.seed(cfg.seed)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.shape[0]
|
||||
max_action = float(env.action_space.high[0])
|
||||
td3= TD3(state_dim,action_dim,max_action,cfg)
|
||||
td3= TD3(n_states,n_actions,max_action,cfg)
|
||||
cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
|
||||
cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
|
||||
td3.load(cfg.model_path)
|
||||
|
||||
@@ -79,7 +79,7 @@ def train(cfg,env,agent):
|
||||
else:
|
||||
action = (
|
||||
agent.choose_action(np.array(state))
|
||||
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
|
||||
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
|
||||
).clip(-max_action, max_action)
|
||||
# Perform action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
@@ -109,10 +109,10 @@ if __name__ == "__main__":
|
||||
env.seed(1) # 随机种子
|
||||
torch.manual_seed(1)
|
||||
np.random.seed(1)
|
||||
state_dim = env.observation_space.shape[0]
|
||||
action_dim = env.action_space.shape[0]
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.shape[0]
|
||||
max_action = float(env.action_space.high[0])
|
||||
agent = TD3(state_dim,action_dim,max_action,cfg)
|
||||
agent = TD3(n_states,n_actions,max_action,cfg)
|
||||
rewards,ma_rewards = train(cfg,env,agent)
|
||||
make_dir(plot_cfg.result_path,plot_cfg.model_path)
|
||||
agent.save(path=plot_cfg.model_path)
|
||||
|
||||
284 codes/common/atari_wrappers.py Normal file
@@ -0,0 +1,284 @@
|
||||
import numpy as np
|
||||
import os
|
||||
os.environ.setdefault('PATH', '')
|
||||
from collections import deque
|
||||
import gym
|
||||
from gym import spaces
|
||||
import cv2
|
||||
cv2.ocl.setUseOpenCL(False)
|
||||
from .wrappers import TimeLimit
|
||||
|
||||
|
||||
class NoopResetEnv(gym.Wrapper):
|
||||
def __init__(self, env, noop_max=30):
|
||||
"""Sample initial states by taking random number of no-ops on reset.
|
||||
No-op is assumed to be action 0.
|
||||
"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
self.noop_max = noop_max
|
||||
self.override_num_noops = None
|
||||
self.noop_action = 0
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
|
||||
def reset(self, **kwargs):
|
||||
""" Do no-op action for a number of steps in [1, noop_max]."""
|
||||
self.env.reset(**kwargs)
|
||||
if self.override_num_noops is not None:
|
||||
noops = self.override_num_noops
|
||||
else:
|
||||
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
|
||||
assert noops > 0
|
||||
obs = None
|
||||
for _ in range(noops):
|
||||
obs, _, done, _ = self.env.step(self.noop_action)
|
||||
if done:
|
||||
obs = self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class FireResetEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Take action on reset for environments that are fixed until firing."""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
|
||||
assert len(env.unwrapped.get_action_meanings()) >= 3
|
||||
|
||||
def reset(self, **kwargs):
|
||||
self.env.reset(**kwargs)
|
||||
obs, _, done, _ = self.env.step(1)
|
||||
if done:
|
||||
self.env.reset(**kwargs)
|
||||
obs, _, done, _ = self.env.step(2)
|
||||
if done:
|
||||
self.env.reset(**kwargs)
|
||||
return obs
|
||||
|
||||
def step(self, ac):
|
||||
return self.env.step(ac)
|
||||
|
||||
class EpisodicLifeEnv(gym.Wrapper):
|
||||
def __init__(self, env):
|
||||
"""Make end-of-life == end-of-episode, but only reset on true game over.
|
||||
Done by DeepMind for the DQN and co. since it helps value estimation.
|
||||
"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
self.lives = 0
|
||||
self.was_real_done = True
|
||||
|
||||
def step(self, action):
|
||||
obs, reward, done, info = self.env.step(action)
|
||||
self.was_real_done = done
|
||||
# check current lives, make loss of life terminal,
|
||||
# then update lives to handle bonus lives
|
||||
lives = self.env.unwrapped.ale.lives()
|
||||
if lives < self.lives and lives > 0:
|
||||
# for Qbert sometimes we stay in lives == 0 condition for a few frames
|
||||
# so it's important to keep lives > 0, so that we only reset once
|
||||
# the environment advertises done.
|
||||
done = True
|
||||
self.lives = lives
|
||||
return obs, reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
"""Reset only when lives are exhausted.
|
||||
This way all states are still reachable even though lives are episodic,
|
||||
and the learner need not know about any of this behind-the-scenes.
|
||||
"""
|
||||
if self.was_real_done:
|
||||
obs = self.env.reset(**kwargs)
|
||||
else:
|
||||
# no-op step to advance from terminal/lost life state
|
||||
obs, _, _, _ = self.env.step(0)
|
||||
self.lives = self.env.unwrapped.ale.lives()
|
||||
return obs
|
||||
|
||||
class MaxAndSkipEnv(gym.Wrapper):
|
||||
def __init__(self, env, skip=4):
|
||||
"""Return only every `skip`-th frame"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
# most recent raw observations (for max pooling across time steps)
|
||||
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
|
||||
self._skip = skip
|
||||
|
||||
def step(self, action):
|
||||
"""Repeat action, sum reward, and max over last observations."""
|
||||
total_reward = 0.0
|
||||
done = None
|
||||
for i in range(self._skip):
|
||||
obs, reward, done, info = self.env.step(action)
|
||||
if i == self._skip - 2: self._obs_buffer[0] = obs
|
||||
if i == self._skip - 1: self._obs_buffer[1] = obs
|
||||
total_reward += reward
|
||||
if done:
|
||||
break
|
||||
# Note that the observation on the done=True frame
|
||||
# doesn't matter
|
||||
max_frame = self._obs_buffer.max(axis=0)
|
||||
|
||||
return max_frame, total_reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
class ClipRewardEnv(gym.RewardWrapper):
|
||||
def __init__(self, env):
|
||||
gym.RewardWrapper.__init__(self, env)
|
||||
|
||||
def reward(self, reward):
|
||||
"""Bin reward to {+1, 0, -1} by its sign."""
|
||||
return np.sign(reward)
|
||||
|
||||
|
||||
class WarpFrame(gym.ObservationWrapper):
|
||||
def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
|
||||
"""
|
||||
Warp frames to 84x84 as done in the Nature paper and later work.
|
||||
If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which
|
||||
observation should be warped.
|
||||
"""
|
||||
super().__init__(env)
|
||||
self._width = width
|
||||
self._height = height
|
||||
self._grayscale = grayscale
|
||||
self._key = dict_space_key
|
||||
if self._grayscale:
|
||||
num_colors = 1
|
||||
else:
|
||||
num_colors = 3
|
||||
|
||||
new_space = gym.spaces.Box(
|
||||
low=0,
|
||||
high=255,
|
||||
shape=(self._height, self._width, num_colors),
|
||||
dtype=np.uint8,
|
||||
)
|
||||
if self._key is None:
|
||||
original_space = self.observation_space
|
||||
self.observation_space = new_space
|
||||
else:
|
||||
original_space = self.observation_space.spaces[self._key]
|
||||
self.observation_space.spaces[self._key] = new_space
|
||||
assert original_space.dtype == np.uint8 and len(original_space.shape) == 3
|
||||
|
||||
def observation(self, obs):
|
||||
if self._key is None:
|
||||
frame = obs
|
||||
else:
|
||||
frame = obs[self._key]
|
||||
|
||||
if self._grayscale:
|
||||
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
|
||||
frame = cv2.resize(
|
||||
frame, (self._width, self._height), interpolation=cv2.INTER_AREA
|
||||
)
|
||||
if self._grayscale:
|
||||
frame = np.expand_dims(frame, -1)
|
||||
|
||||
if self._key is None:
|
||||
obs = frame
|
||||
else:
|
||||
obs = obs.copy()
|
||||
obs[self._key] = frame
|
||||
return obs
|
||||
|
||||
|
||||
class FrameStack(gym.Wrapper):
|
||||
def __init__(self, env, k):
|
||||
"""Stack k last frames.
|
||||
Returns lazy array, which is much more memory efficient.
|
||||
See Also
|
||||
--------
|
||||
baselines.common.atari_wrappers.LazyFrames
|
||||
"""
|
||||
gym.Wrapper.__init__(self, env)
|
||||
self.k = k
|
||||
self.frames = deque([], maxlen=k)
|
||||
shp = env.observation_space.shape
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
|
||||
|
||||
def reset(self):
|
||||
ob = self.env.reset()
|
||||
for _ in range(self.k):
|
||||
self.frames.append(ob)
|
||||
return self._get_ob()
|
||||
|
||||
def step(self, action):
|
||||
ob, reward, done, info = self.env.step(action)
|
||||
self.frames.append(ob)
|
||||
return self._get_ob(), reward, done, info
|
||||
|
||||
def _get_ob(self):
|
||||
assert len(self.frames) == self.k
|
||||
return LazyFrames(list(self.frames))
|
||||
|
||||
class ScaledFloatFrame(gym.ObservationWrapper):
|
||||
def __init__(self, env):
|
||||
gym.ObservationWrapper.__init__(self, env)
|
||||
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
|
||||
|
||||
def observation(self, observation):
|
||||
# careful! This undoes the memory optimization, use
|
||||
# with smaller replay buffers only.
|
||||
return np.array(observation).astype(np.float32) / 255.0
|
||||
|
||||
class LazyFrames(object):
|
||||
def __init__(self, frames):
|
||||
"""This object ensures that common frames between the observations are only stored once.
|
||||
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
|
||||
buffers.
|
||||
This object should only be converted to numpy array before being passed to the model.
|
||||
You'd not believe how complex the previous solution was."""
|
||||
self._frames = frames
|
||||
self._out = None
|
||||
|
||||
def _force(self):
|
||||
if self._out is None:
|
||||
self._out = np.concatenate(self._frames, axis=-1)
|
||||
self._frames = None
|
||||
return self._out
|
||||
|
||||
def __array__(self, dtype=None):
|
||||
out = self._force()
|
||||
if dtype is not None:
|
||||
out = out.astype(dtype)
|
||||
return out
|
||||
|
||||
def __len__(self):
|
||||
return len(self._force())
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self._force()[i]
|
||||
|
||||
def count(self):
|
||||
frames = self._force()
|
||||
return frames.shape[frames.ndim - 1]
|
||||
|
||||
def frame(self, i):
|
||||
return self._force()[..., i]
|
||||
|
||||
def make_atari(env_id, max_episode_steps=None):
|
||||
env = gym.make(env_id)
|
||||
assert 'NoFrameskip' in env.spec.id
|
||||
env = NoopResetEnv(env, noop_max=30)
|
||||
env = MaxAndSkipEnv(env, skip=4)
|
||||
if max_episode_steps is not None:
|
||||
env = TimeLimit(env, max_episode_steps=max_episode_steps)
|
||||
return env
|
||||
|
||||
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
|
||||
"""Configure environment for DeepMind-style Atari.
|
||||
"""
|
||||
if episode_life:
|
||||
env = EpisodicLifeEnv(env)
|
||||
if 'FIRE' in env.unwrapped.get_action_meanings():
|
||||
env = FireResetEnv(env)
|
||||
env = WarpFrame(env)
|
||||
if scale:
|
||||
env = ScaledFloatFrame(env)
|
||||
if clip_rewards:
|
||||
env = ClipRewardEnv(env)
|
||||
if frame_stack:
|
||||
env = FrameStack(env, 4)
|
||||
return env
|
||||
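A typical way to combine the helpers defined in this new file, matching the DeepMind-style Atari preprocessing they implement (the environment id is just an example; the import path assumes the file lives in the `common` package as shown):

```python
from common.atari_wrappers import make_atari, wrap_deepmind

env = make_atari("PongNoFrameskip-v4", max_episode_steps=10000)
env = wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=True, scale=False)
obs = env.reset()   # LazyFrames of shape (84, 84, 4): four stacked grayscale frames
```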
@@ -32,10 +32,10 @@ class MLP(nn.Module):
|
||||
return self.fc3(x)
|
||||
|
||||
class Critic(nn.Module):
|
||||
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
|
||||
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
|
||||
super(Critic, self).__init__()
|
||||
|
||||
self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
|
||||
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, hidden_size)
|
||||
self.linear3 = nn.Linear(hidden_size, 1)
|
||||
# 随机初始化为较小的值
|
||||
@@ -51,11 +51,11 @@ class Critic(nn.Module):
|
||||
return x
|
||||
|
||||
class Actor(nn.Module):
|
||||
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
|
||||
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
|
||||
super(Actor, self).__init__()
|
||||
self.linear1 = nn.Linear(n_obs, hidden_size)
|
||||
self.linear2 = nn.Linear(hidden_size, hidden_size)
|
||||
self.linear3 = nn.Linear(hidden_size, action_dim)
|
||||
self.linear3 = nn.Linear(hidden_size, n_actions)
|
||||
|
||||
self.linear3.weight.data.uniform_(-init_w, init_w)
|
||||
self.linear3.bias.data.uniform_(-init_w, init_w)
|
||||
@@ -67,18 +67,18 @@ class Actor(nn.Module):
|
||||
return x
|
||||
|
||||
class ActorCritic(nn.Module):
|
||||
def __init__(self, state_dim, action_dim, hidden_dim=256):
|
||||
def __init__(self, n_states, n_actions, hidden_dim=256):
|
||||
super(ActorCritic, self).__init__()
|
||||
self.critic = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, 1)
|
||||
)
|
||||
|
||||
self.actor = nn.Sequential(
|
||||
nn.Linear(state_dim, hidden_dim),
|
||||
nn.Linear(n_states, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, action_dim),
|
||||
nn.Linear(hidden_dim, n_actions),
|
||||
nn.Softmax(dim=1),
|
||||
)
|
||||
|
||||
|
||||
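The `ActorCritic` head above ends in `nn.Softmax(dim=1)`, so the actor outputs a probability distribution over discrete actions while the critic outputs a scalar state value. Sampling from it is typically done with a `Categorical` distribution; a usage sketch (the import path is assumed, not shown in this hunk):

```python
import torch
from torch.distributions import Categorical
from common.model import ActorCritic   # assumed module path for the class above

model = ActorCritic(n_states=4, n_actions=2, hidden_dim=256)
state = torch.randn(1, 4)               # batch of one state
value = model.critic(state)             # shape (1, 1)
probs = model.actor(state)              # shape (1, 2), rows sum to 1
dist = Categorical(probs)
action = dist.sample()                  # tensor([0]) or tensor([1])
log_prob = dist.log_prob(action)        # used by the policy-gradient loss
```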
29 codes/common/wrappers.py Normal file
@@ -0,0 +1,29 @@
|
||||
import gym
|
||||
|
||||
class TimeLimit(gym.Wrapper):
|
||||
def __init__(self, env, max_episode_steps=None):
|
||||
super(TimeLimit, self).__init__(env)
|
||||
self._max_episode_steps = max_episode_steps
|
||||
self._elapsed_steps = 0
|
||||
|
||||
def step(self, ac):
|
||||
observation, reward, done, info = self.env.step(ac)
|
||||
self._elapsed_steps += 1
|
||||
if self._elapsed_steps >= self._max_episode_steps:
|
||||
done = True
|
||||
info['TimeLimit.truncated'] = True
|
||||
return observation, reward, done, info
|
||||
|
||||
def reset(self, **kwargs):
|
||||
self._elapsed_steps = 0
|
||||
return self.env.reset(**kwargs)
|
||||
|
||||
class ClipActionsWrapper(gym.Wrapper):
|
||||
def step(self, action):
|
||||
import numpy as np
|
||||
action = np.nan_to_num(action)
|
||||
action = np.clip(action, self.action_space.low, self.action_space.high)
|
||||
return self.env.step(action)
|
||||
|
||||
def reset(self, **kwargs):
|
||||
return self.env.reset(**kwargs)
|
||||
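The `TimeLimit` wrapper above simply truncates an episode after `max_episode_steps` and flags the truncation via `info['TimeLimit.truncated']`. Usage sketch (environment id illustrative, import path assumed):

```python
import gym
from common.wrappers import TimeLimit

env = TimeLimit(gym.make("CartPole-v0"), max_episode_steps=200)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
if done and info.get("TimeLimit.truncated"):
    print("episode ended because the step limit was hit, not because of failure")
```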
@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
|
||||
self.natural = natural
|
||||
# Start the first game
|
||||
self._reset() # Number of
|
||||
self.action_dim = 2
|
||||
self.n_actions = 2
|
||||
|
||||
def reset(self):
|
||||
return self._reset()
|
||||
|
||||
@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
|
||||
self.shape = (4, 12)
|
||||
|
||||
nS = np.prod(self.shape)
|
||||
action_dim = 4
|
||||
n_actions = 4
|
||||
|
||||
# Cliff Location
|
||||
self._cliff = np.zeros(self.shape, dtype=np.bool)
|
||||
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
|
||||
P = {}
|
||||
for s in range(nS):
|
||||
position = np.unravel_index(s, self.shape)
|
||||
P[s] = { a : [] for a in range(action_dim) }
|
||||
P[s] = { a : [] for a in range(n_actions) }
|
||||
P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
|
||||
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
|
||||
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
|
||||
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
|
||||
isd = np.zeros(nS)
|
||||
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
|
||||
|
||||
super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
|
||||
super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
|
||||
|
||||
def render(self, mode='human', close=False):
|
||||
self._render(mode, close)
|
||||
|
||||
@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
|
||||
self.shape = shape
|
||||
|
||||
nS = np.prod(shape)
|
||||
action_dim = 4
|
||||
n_actions = 4
|
||||
|
||||
MAX_Y = shape[0]
|
||||
MAX_X = shape[1]
|
||||
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
|
||||
y, x = it.multi_index
|
||||
|
||||
# P[s][a] = (prob, next_state, reward, is_done)
|
||||
P[s] = {a : [] for a in range(action_dim)}
|
||||
P[s] = {a : [] for a in range(n_actions)}
|
||||
|
||||
is_done = lambda s: s == 0 or s == (nS - 1)
|
||||
reward = 0.0 if is_done(s) else -1.0
|
||||
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
|
||||
# This should not be used in any model-free learning algorithm
|
||||
self.P = P
|
||||
|
||||
super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
|
||||
super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
|
||||
|
||||
def _render(self, mode='human', close=False):
|
||||
""" Renders the current gridworld layout
|
||||
|
||||
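Both grid environments hand `discrete.DiscreteEnv` a transition table in the form noted in the comment above, `P[s][a] = [(prob, next_state, reward, is_done), ...]`, together with the state/action counts and an initial state distribution. A two-state toy example of that structure:

```python
# state 0: action 0 stays put, action 1 moves to the terminal state 1; every step costs -1
P = {
    0: {0: [(1.0, 0, -1.0, False)],
        1: [(1.0, 1, -1.0, True)]},
    1: {0: [(1.0, 1, 0.0, True)],
        1: [(1.0, 1, 0.0, True)]},
}
nS, nA = 2, 2
isd = [1.0, 0.0]   # always start in state 0
# discrete.DiscreteEnv(nS, nA, P, isd) accepts a table of exactly this shape
```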
@@ -17,31 +17,31 @@ class StochasticMDP:
|
||||
def __init__(self):
|
||||
self.end = False
|
||||
self.curr_state = 2
|
||||
self.action_dim = 2
|
||||
self.state_dim = 6
|
||||
self.n_actions = 2
|
||||
self.n_states = 6
|
||||
self.p_right = 0.5
|
||||
|
||||
def reset(self):
|
||||
self.end = False
|
||||
self.curr_state = 2
|
||||
state = np.zeros(self.state_dim)
|
||||
state = np.zeros(self.n_states)
|
||||
state[self.curr_state - 1] = 1.
|
||||
return state
|
||||
|
||||
def step(self, action):
|
||||
if self.curr_state != 1:
|
||||
if action == 1:
|
||||
if random.random() < self.p_right and self.curr_state < self.state_dim:
|
||||
if random.random() < self.p_right and self.curr_state < self.n_states:
|
||||
self.curr_state += 1
|
||||
else:
|
||||
self.curr_state -= 1
|
||||
|
||||
if action == 0:
|
||||
self.curr_state -= 1
|
||||
if self.curr_state == self.state_dim:
|
||||
if self.curr_state == self.n_states:
|
||||
self.end = True
|
||||
|
||||
state = np.zeros(self.state_dim)
|
||||
state = np.zeros(self.n_states)
|
||||
state[self.curr_state - 1] = 1.
|
||||
|
||||
if self.curr_state == 1:
|
||||
|
||||
@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
|
||||
self.shape = (7, 10)
|
||||
|
||||
nS = np.prod(self.shape)
|
||||
action_dim = 4
|
||||
n_actions = 4
|
||||
|
||||
# Wind strength
|
||||
winds = np.zeros(self.shape)
|
||||
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
|
||||
P = {}
|
||||
for s in range(nS):
|
||||
position = np.unravel_index(s, self.shape)
|
||||
P[s] = { a : [] for a in range(action_dim) }
|
||||
P[s] = { a : [] for a in range(n_actions) }
|
||||
P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
|
||||
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
|
||||
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
|
||||
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
|
||||
isd = np.zeros(nS)
|
||||
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
|
||||
|
||||
super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
|
||||
super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
|
||||
|
||||
def render(self, mode='human', close=False):
|
||||
self._render(mode, close)
|
||||