update codes

This commit is contained in:
johnjim0816
2021-12-22 16:55:09 +08:00
parent 75df999258
commit 41fb561d25
75 changed files with 1248 additions and 918 deletions

View File

@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
class A2C:
''' A2C algorithm
'''
def __init__(self,state_dim,action_dim,cfg) -> None:
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = cfg.device
self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def compute_returns(self,next_value, rewards, masks):
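The diff cuts compute_returns off here; for context, a minimal sketch of the usual bootstrapped discounted-return computation (assuming rewards and masks are lists of per-step tensors, with masks[t] = 0 where the episode ended) looks like:

```python
def compute_returns(self, next_value, rewards, masks):
    # bootstrap from the critic's estimate of the state after the last step
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + self.gamma * R * masks[step]  # G_t = r_t + gamma * G_{t+1}
        returns.insert(0, R)
    return returns
```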

View File

@@ -74,9 +74,9 @@ def train(cfg,envs):
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
state_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.n
model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
n_states = envs.observation_space.shape[0]
n_actions = envs.action_space.n
model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
optimizer = optim.Adam(model.parameters())
frame_idx = 0
test_rewards = []

View File

@@ -39,15 +39,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):
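get_action is likewise truncated by the diff; a sketch of the conventional completion, which adds the evolved Ornstein-Uhlenbeck noise to the action and anneals sigma from max_sigma to min_sigma over decay_period, is:

```python
def get_action(self, action, t=0):
    ou_obs = self.evolve_obs()  # advance the OU process one step
    # anneal the noise scale linearly over the decay period
    self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
    return np.clip(action + ou_obs, self.low, self.high)  # keep the noisy action within bounds
```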

View File

@@ -50,15 +50,15 @@ import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, state_dim=4, action_dim=18):
def __init__(self, n_states=4, n_actions=18):
""" 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(state_dim, 128) # input layer
self.fc1 = nn.Linear(n_states, 128) # input layer
self.fc2 = nn.Linear(128, 128) # hidden layer
self.fc3 = nn.Linear(128, action_dim) # output layer
self.fc3 = nn.Linear(128, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
@@ -66,7 +66,7 @@ class FCN(nn.Module):
x = F.relu(self.fc2(x))
return self.fc3(x)
```
The input size is state_dim and the output size is action_dim, with one 128-unit hidden layer; the width and number of hidden layers can be increased as needed, and relu is the typical activation. This is the same network setup as in ordinary deep learning.
The input size is n_states and the output size is n_actions, with one 128-unit hidden layer; the width and number of hidden layers can be increased as needed, and relu is the typical activation. This is the same network setup as in ordinary deep learning.
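As a quick sanity check (a hypothetical snippet, not part of the original text), the network can be instantiated and run on a batch of CartPole-like states:

```python
import torch

net = FCN(n_states=4, n_actions=2)  # CartPole: 4 state features, 2 actions
states = torch.randn(32, 4)         # a batch of 32 random states
q_values = net(states)              # shape (32, 2): one Q value per action
print(q_values.shape)
```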
### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
Build the two networks, along with the optimizer and the replay memory, inside the class:
```python
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
if random.random() > self.epsilon(self.frame_idx):
action = self.predict(state)
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
```

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-09-15 13:35:36
LastEditTime: 2021-12-22 14:01:37
@Description:
@Environment: python 3.7.7
'''
@@ -21,15 +21,15 @@ import math
import numpy as np
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态数
action_dim: 输出的动作维度
n_states: 输入的特征数即环境的状态数
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # input layer
self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
self.fc3 = nn.Linear(hidden_dim, action_dim) # output layer
self.fc3 = nn.Linear(hidden_dim, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
@@ -62,9 +62,9 @@ class ReplayBuffer:
return len(self.buffer)
class DQN:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.action_dim = action_dim # total number of actions
self.n_actions = n_actions # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # reward discount factor
# epsilon-greedy policy parameters
@@ -73,8 +73,8 @@ class DQN:
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy parameters to the target network
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # optimizer
@@ -90,7 +90,7 @@ class DQN:
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # skip the update until the replay memory holds a full batch

133
codes/DQN/dqn_cnn.py Normal file
View File

@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import numpy as np
import random
import math
class CNN(nn.Module):
def __init__(self, input_dim, output_dim):
super(CNN, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.features = nn.Sequential(
nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
self.fc = nn.Sequential(
nn.Linear(self.feature_size(), 512),
nn.ReLU(),
nn.Linear(512, self.output_dim)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def feature_size(self):
# infer the flattened size of the conv output by passing a dummy zero input through the conv stack
return self.features(torch.zeros(1, *self.input_dim)).view(1, -1).size(1)
def act(self, state, epsilon):
if random.random() > epsilon:
with torch.no_grad(): # replaces the deprecated Variable(..., volatile=True)
state = torch.FloatTensor(np.float32(state)).unsqueeze(0)
q_value = self.forward(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.output_dim) # was env.action_space.n, which is undefined in this scope
return action
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # the buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: when capacity is exceeded, the oldest stored transitions are dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # reward discount factor
# epsilon-greedy policy parameters
self.frame_idx = 0 # frame counter for epsilon decay
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy parameters to the target network
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # optimizer
self.memory = ReplayBuffer(cfg.memory_capacity) # experience replay
def choose_action(self, state):
''' Select an action
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # skip the update until the replay memory holds a full batch
return
# randomly sample a batch of transitions from the replay memory
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# convert to tensors
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # compute Q(s_t, a) for the actions actually taken
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # compute the target net's max Q value for the next states
# compute the expected Q values; for terminal states done_batch=1, so expected_q_value reduces to the reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # mean squared error loss
# optimize and update the model
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)

View File

@@ -9,11 +9,10 @@ import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from DQN.agent import DQN
from DQN.train import train,test
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = "DQN" # algorithm name
algo_name = 'DQN' # algorithm name
env_name = 'CartPole-v0' # environment name
class DQNConfig:
@@ -51,25 +50,82 @@ def env_agent_config(cfg, seed=1):
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim, action_dim, cfg) # create the agent
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

View File

@@ -1,3 +1,13 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 11:40:44
Description: train CartPole-v1 with Nature DQN
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
@@ -9,9 +19,7 @@ import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from DQN.agent import DQN
from DQN.train import train,test
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = "DQN" # algorithm name
@@ -58,26 +66,83 @@ def env_agent_config(cfg, seed=1):
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim, action_dim, cfg) # create the agent
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # save the results
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # save the results
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

150
codes/DQN/task2.py Normal file
View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:27:48
Description: train PongNoFrameskip-v4 with DQN-cnn
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from common.atari_wrappers import make_atari, wrap_deepmind
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'DQN-cnn' # algorithm name
env_name = 'PongNoFrameskip-v4' # environment name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # detect GPU
class DQNConfig:
''' Algorithm hyperparameter settings
'''
def __init__(self):
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = device # detect GPU
self.train_eps = 500 # number of training episodes
self.test_eps = 30 # number of testing episodes
# hyperparameters
self.gamma = 0.95 # discount factor in RL
self.epsilon_start = 0.90 # initial epsilon of the e-greedy policy
self.epsilon_end = 0.01 # final epsilon of the e-greedy policy
self.epsilon_decay = 500 # decay rate of epsilon in the e-greedy policy
self.lr = 0.0001 # learning rate
self.memory_capacity = 100000 # capacity of the replay buffer
self.batch_size = 64 # batch size for mini-batch SGD
self.target_update = 4 # update frequency of the target network
self.hidden_dim = 256 # hidden layer size of the network
class PlotConfig:
''' Plotting settings
'''
def __init__(self) -> None:
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = device # detect GPU
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # path for saving results
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # path for saving models
self.save = True # whether to save figures
def env_agent_config(cfg, seed=1):
''' Create the environment and the agent
'''
env = make_atari(cfg.env_name) # create the environment
# env = wrap_deepmind(env)
# env = wrap_pytorch(env)
env.seed(seed) # set the random seed
n_states = env.observation_space.shape # full observation shape; the CNN indexes input_dim[0], so it needs (channels, height, width) rather than shape[0]
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

File diff suppressed because one or more lines are too long

View File

@@ -1,138 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-12-22 11:08:04
@Description:
@Environment: python 3.7.7
'''
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from DQN.agent import DQN
from DQN.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
class DQNConfig:
def __init__(self):
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
# 超参数
self.gamma = 0.95 # 强化学习中的折扣因子
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
self.lr = 0.0001 # 学习率
self.memory_capacity = 100000 # 经验回放的容量
self.batch_size = 64 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg,seed=1):
''' Create the environment and the agent
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim,action_dim,cfg) # create the agent
return env,agent
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=plot_cfg.result_path) # save the results
plot_rewards(rewards,ma_rewards, plot_cfg, tag="test") # plot the results

View File

@@ -90,15 +90,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):

View File

@@ -14,10 +14,10 @@ CartPole-v0 is a classic introductory environment; as shown below, it moves the cart via left (action=0
import gym
env = gym.make('CartPole-v0') # create the environment
env.seed(1) # random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
state = env.reset() # reset the environment
print(f"Number of states: {state_dim}, number of actions: {action_dim}")
print(f"Number of states: {n_states}, number of actions: {n_actions}")
print(f"Initial state: {state}")
```
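To see the full interaction loop, here is an illustrative rollout with a random policy standing in for an agent (using the old 4-tuple gym step API that this repo targets):

```python
done = False
ep_reward = 0
while not done:
    action = env.action_space.sample()          # random policy, for illustration only
    state, reward, done, _ = env.step(action)   # old gym API: (obs, reward, done, info)
    ep_reward += reward
print(f"Episode reward: {ep_reward}")
```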
@@ -157,7 +157,7 @@ def choose_action(self, state):
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
```
As you can see, this is essentially the same as the Q-learning algorithm, using the same $\epsilon$-greedy policy; the only difference is that with a neural network we need tools like Torch or TensorFlow to handle the corresponding data.
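For reference, the exponential epsilon schedule shared by these agents can be previewed in isolation (a standalone sketch using the repo's default hyperparameters):

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.90, 0.01, 500
epsilon = lambda frame_idx: epsilon_end + \
    (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)
for frame_idx in [0, 100, 500, 2000]:
    print(frame_idx, round(epsilon(frame_idx), 3))  # decays from ~0.90 toward 0.01
```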

View File

@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # wrap the environment
Here the program uses a wrapper to redefine the environment; this does not change how you should think about the environment, and interested readers can inspect the corresponding code. Because gym environments are well encapsulated, using one only takes a gym.make call with the environment name, after which we can check the numbers of states and actions:
```python
state_dim = env.observation_space.n # number of states
action_dim = env.action_space.n # number of actions
print(f"Number of states: {state_dim}, number of actions: {action_dim}")
n_states = env.observation_space.n # number of states
n_actions = env.action_space.n # number of actions
print(f"Number of states: {n_states}, number of actions: {n_actions}")
```
The printed output is as follows:
@@ -72,9 +72,9 @@ print(state)
env = gym.make('CliffWalking-v0') # create the environment
env = CliffWalkingWapper(env) # wrap the environment
env.seed(1) # set the random seed
state_dim = env.observation_space.n # number of states
action_dim = env.action_space.n # number of actions
agent = QLearning(state_dim,action_dim,cfg) # cfg stores the algorithm hyperparameters
n_states = env.observation_space.n # number of states
n_actions = env.action_space.n # number of actions
agent = QLearning(n_states,n_actions,cfg) # cfg stores the algorithm hyperparameters
for i_ep in range(cfg.train_eps): # cfg.train_eps is the maximum number of training episodes
ep_reward = 0 # reward of each episode
state = env.reset() # reset the environment
@@ -126,7 +126,7 @@ def choose_action(self, state):
if np.random.uniform(0, 1) > self.epsilon:
action = np.argmax(self.Q_table[str(state)]) # pick the action with the largest Q(s,a)
else:
action = np.random.choice(self.action_dim) # randomly select an action
action = np.random.choice(self.n_actions) # randomly select an action
return action
```

View File

@@ -136,12 +136,12 @@
"outputs": [],
"source": [
"class DuelingNet(nn.Module):\n",
" def __init__(self, state_dim, action_dim,hidden_size=128):\n",
" def __init__(self, n_states, n_actions,hidden_size=128):\n",
" super(DuelingNet, self).__init__()\n",
" \n",
" # 隐藏层\n",
" self.hidden = nn.Sequential(\n",
" nn.Linear(state_dim, hidden_size),\n",
" nn.Linear(n_states, hidden_size),\n",
" nn.ReLU()\n",
" )\n",
" \n",
@@ -149,7 +149,7 @@
" self.advantage = nn.Sequential(\n",
" nn.Linear(hidden_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Linear(hidden_size, action_dim)\n",
" nn.Linear(hidden_size, n_actions)\n",
" )\n",
" \n",
" # 价值函数\n",
@@ -192,7 +192,7 @@
],
"source": [
"class DuelingDQN:\n",
" def __init__(self,state_dim,action_dim,cfg) -> None:\n",
" def __init__(self,n_states,n_actions,cfg) -> None:\n",
" self.batch_size = cfg.batch_size\n",
" self.device = cfg.device\n",
" self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
" target_param.data.copy_(param.data)\n",
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
" q_values = self.policy_net(state)\n",
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
" else:\n",
" action = random.randrange(self.action_dim)\n",
" action = random.randrange(self.n_actions)\n",
" return action\n",
" def update(self):\n",
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n",

5
codes/Logs.md Normal file
View File

@@ -0,0 +1,5 @@
## Changelog of updates
**2021.12.22-3**: renamed ```agent.py``` to the corresponding algorithm name, making cases like ```dqn``` vs. ```dqn_cnn``` easier to tell apart
**2021.12.22-2**: simplified the code structure by merging the former ```train.py```, ```task.py```, etc. into ```task.py```
**2021.12.22-1**: simplified the code structure by merging the former ```model.py```, ```memory.py```, etc. into ```agent.py```, and moving the contents of ```plot.py``` into ```common.utils.py```

View File

@@ -17,11 +17,11 @@ import dill
class FisrtVisitMC:
''' On-Policy First-Visit MC Control
'''
def __init__(self,action_dim,cfg):
self.action_dim = action_dim
def __init__(self,n_actions,cfg):
self.n_actions = n_actions
self.epsilon = cfg.epsilon
self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(action_dim))
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns
self.returns_count = defaultdict(float)
@@ -29,11 +29,11 @@ class FisrtVisitMC:
''' e-greedy policy '''
if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else:
action = np.random.randint(0,self.action_dim)
action = np.random.randint(0,self.n_actions)
return action
def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition

View File

@@ -43,8 +43,8 @@ class MCConfig:
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
action_dim = 9
agent = FisrtVisitMC(action_dim, cfg)
n_actions = 9
agent = FisrtVisitMC(n_actions, cfg)
return env,agent
def train(cfg, env, agent):

View File

@@ -0,0 +1,52 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class NoisyLinear(nn.Module):
def __init__(self, input_dim, output_dim, std_init=0.4):
super(NoisyLinear, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.std_init = std_init
self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim))
self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim))
self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim))
self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim))
self.reset_parameters()
self.reset_noise()
def forward(self, x):
if self.training:
weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon)
bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon) # deprecated Variable wrapper removed
else:
weight = self.weight_mu
bias = self.bias_mu
return F.linear(x, weight, bias)
def reset_parameters(self):
mu_range = 1 / math.sqrt(self.weight_mu.size(1))
self.weight_mu.data.uniform_(-mu_range, mu_range)
self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
self.bias_mu.data.uniform_(-mu_range, mu_range)
self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
def reset_noise(self):
epsilon_in = self._scale_noise(self.input_dim)
epsilon_out = self._scale_noise(self.output_dim)
self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
self.bias_epsilon.copy_(self._scale_noise(self.output_dim))
def _scale_noise(self, size):
x = torch.randn(size)
x = x.sign().mul(x.abs().sqrt())
return x
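NoisyLinear is a drop-in replacement for nn.Linear in the style of Noisy Networks for Exploration; a hypothetical usage sketch (not part of this commit) would resample the factorised noise between updates:

```python
layer = NoisyLinear(128, 4)  # e.g. the output head of a Q-network
x = torch.randn(32, 128)
q = layer(x)                 # training mode: sampled noisy weights drive exploration
layer.reset_noise()          # resample the factorised noise between updates
layer.eval()
q_det = layer(x)             # eval mode: deterministic mu weights only
```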

View File

@@ -57,16 +57,16 @@ the model consists of two networks, the actor and the critic
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
def __init__(self,n_states, n_actions,
hidden_dim=256):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim=256):
def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
value = self.critic(state)
return value
```
Here the Actor produces a probability distribution (Categorical; other distributions work too, see torch.distributions), while the critic maps the current state to a value. The critic's input dimension could also be ```state_dim+action_dim```, i.e., feeding the action information into the critic network as well, which tends to work better; interested readers can try it.
Here the Actor produces a probability distribution (Categorical; other distributions work too, see torch.distributions), while the critic maps the current state to a value. The critic's input dimension could also be ```n_states+n_actions```, i.e., feeding the action information into the critic network as well, which tends to work better; interested readers can try it.
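A sketch of that variant (a hypothetical Q-style critic, not the one used in this commit, assuming `import torch` alongside the `torch.nn` import above) concatenates the action into the input:

```python
class QCritic(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super(QCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states + n_actions, hidden_dim),  # state and action concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, state, action):
        return self.critic(torch.cat([state, action], dim=-1))
```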
### PPO update
Define an update function, which mainly implements steps 6 and 7 of the pseudocode:
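For orientation, here is a condensed sketch of those two steps, the clipped surrogate policy loss and the value regression. It is illustrative only: the repo's actual update also uses GAE and minibatching, and the names states, actions, old_log_probs, advantages, and returns are assumed here:

```python
dist = self.actor(states)                      # new policy distribution
new_log_probs = dist.log_prob(actions)
ratio = (new_log_probs - old_log_probs).exp()  # pi_new(a|s) / pi_old(a|s)
surr1 = ratio * advantages
surr2 = torch.clamp(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * advantages
actor_loss = -torch.min(surr1, surr2).mean()   # step 6: clipped surrogate objective
critic_loss = (returns - self.critic(states)).pow(2).mean()  # step 7: value regression
```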

View File

@@ -16,15 +16,15 @@ import torch.optim as optim
from PPO.model import Actor,Critic
from PPO.memory import PPOMemory
class PPO:
def __init__(self, state_dim, action_dim,cfg):
def __init__(self, n_states, n_actions,cfg):
self.gamma = cfg.gamma
self.continuous = cfg.continuous
self.policy_clip = cfg.policy_clip
self.n_epochs = cfg.n_epochs
self.gae_lambda = cfg.gae_lambda
self.device = cfg.device
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
self.memory = PPOMemory(cfg.batch_size)

View File

@@ -12,16 +12,16 @@ Environment:
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
def __init__(self,n_states, n_actions,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
@@ -30,10 +30,10 @@ class Actor(nn.Module):
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim):
def __init__(self, n_states,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),

View File

@@ -45,9 +45,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = PPO(n_states,n_actions,cfg)
return env,agent
cfg = PPOConfig()

View File

@@ -45,9 +45,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = PPO(n_states,n_actions,cfg)
return env,agent

View File

@@ -90,9 +90,9 @@
"def env_agent_config(cfg,seed=1):\n",
" env = gym.make(cfg.env) \n",
" env.seed(seed)\n",
" state_dim = env.observation_space.shape[0]\n",
" action_dim = env.action_space.n\n",
" agent = PPO(state_dim,action_dim,cfg)\n",
" n_states = env.observation_space.shape[0]\n",
" n_actions = env.action_space.n\n",
" agent = PPO(n_states,n_actions,cfg)\n",
" return env,agent"
]
},

View File

@@ -99,9 +99,9 @@ if __name__ == '__main__':
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = PPO(n_states,n_actions,cfg)
return env,agent
cfg = PPOConfig()

View File

@@ -17,9 +17,9 @@ from PolicyGradient.model import MLP
class PolicyGradient:
def __init__(self, state_dim,cfg):
def __init__(self, n_states,cfg):
self.gamma = cfg.gamma
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size

View File

@@ -19,7 +19,7 @@ class MLP(nn.Module):
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24 and 36 are hidden layer sizes; adjust them based on input_dim and action_dim
# 24 and 36 are hidden layer sizes; adjust them based on input_dim and n_actions
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left

View File

@@ -46,8 +46,8 @@ class PGConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
agent = PolicyGradient(state_dim,cfg)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):

View File

@@ -16,7 +16,7 @@
**Note: in the new version, everything related to ```model``` and ```memory``` has been moved into ```agent.py```, and ```plot``` into ```common.utils```.**
## Requirements
python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
python 3.7、pytorch 1.6.0-1.8.1、gym 0.21.0
## Usage
@@ -36,7 +36,7 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
| [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
| [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
| [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
| [SAC](./SAC) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2]((./envs/mujoco_info.md)) | |

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:53:54
LastEditor: JiangJi
LastEditTime: 2021-04-29 13:56:39
Description:
Environment:
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from common.memory import ReplayBuffer
from SAC.model import ValueNet,PolicyNet,SoftQNet
class SAC:
def __init__(self,state_dim,action_dim,cfg) -> None:
self.batch_size = cfg.batch_size
self.memory = ReplayBuffer(cfg.capacity)
self.device = cfg.device
self.value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
self.target_value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
self.soft_q_net = SoftQNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
self.policy_net = PolicyNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(param.data)
self.value_criterion = nn.MSELoss()
self.soft_q_criterion = nn.MSELoss()
def update(self, gamma=0.99,mean_lambda=1e-3,
std_lambda=1e-3,
z_lambda=0.0,
soft_tau=1e-2,
):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(self.batch_size)
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
expected_q_value = self.soft_q_net(state, action)
expected_value = self.value_net(state)
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
target_value = self.target_value_net(next_state)
next_q_value = reward + (1 - done) * gamma * target_value
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
expected_new_q_value = self.soft_q_net(state, new_action)
next_value = expected_new_q_value - log_prob
value_loss = self.value_criterion(expected_value, next_value.detach())
log_prob_target = expected_new_q_value - expected_value
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
mean_loss = mean_lambda * mean.pow(2).mean()
std_loss = std_lambda * log_std.pow(2).mean()
z_loss = z_lambda * z.pow(2).sum(1).mean()
policy_loss += mean_loss + std_loss + z_loss
self.soft_q_optimizer.zero_grad()
q_value_loss.backward()
self.soft_q_optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
self.value_optimizer.step()
self.policy_optimizer.zero_grad()
policy_loss.backward()
self.policy_optimizer.step()
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - soft_tau) + param.data * soft_tau
)
def save(self, path):
torch.save(self.value_net.state_dict(), path + "sac_value")
torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")
torch.save(self.policy_net.state_dict(), path + "sac_policy")
torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")
def load(self, path):
self.value_net.load_state_dict(torch.load(path + "sac_value"))
self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
self.target_value_net = copy.deepcopy(self.value_net)
self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))
self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))

Binary file not shown.

Binary file not shown.
View File

@@ -14,17 +14,17 @@ from collections import defaultdict
import torch
class Sarsa(object):
def __init__(self,
action_dim,sarsa_cfg,):
self.action_dim = action_dim # number of actions
n_actions,sarsa_cfg,):
self.n_actions = n_actions # number of actions
self.lr = sarsa_cfg.lr # learning rate
self.gamma = sarsa_cfg.gamma
self.epsilon = sarsa_cfg.epsilon
self.Q = defaultdict(lambda: np.zeros(action_dim))
# self.Q = np.zeros((state_dim, action_dim)) # Q table
self.Q = defaultdict(lambda: np.zeros(n_actions))
# self.Q = np.zeros((n_states, n_actions)) # Q table
def choose_action(self, state):
best_action = np.argmax(self.Q[state])
# action = best_action
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
return action

View File

@@ -39,8 +39,8 @@ class SarsaConfig:
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
action_dim=9
agent = Sarsa(action_dim,cfg)
n_actions=9
agent = Sarsa(n_actions,cfg)
return env,agent
def train(cfg,env,agent):

View File

@@ -5,12 +5,13 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:52:11
LastEditor: JiangJi
LastEditTime: 2021-04-29 12:52:31
LastEditTime: 2021-12-22 15:36:36
Description:
Environment:
'''
import gym
import numpy as np
class NormalizedActions(gym.ActionWrapper):
def action(self, action):
low = self.action_space.low

View File

@@ -17,10 +17,10 @@ from torch.distributions import Normal
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ValueNet(nn.Module):
def __init__(self, state_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, hidden_dim, init_w=3e-3):
super(ValueNet, self).__init__()
self.linear1 = nn.Linear(state_dim, hidden_dim)
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
@@ -35,10 +35,10 @@ class ValueNet(nn.Module):
class SoftQNet(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(SoftQNet, self).__init__()
self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
@@ -54,20 +54,20 @@ class SoftQNet(nn.Module):
class PolicyNet(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
super(PolicyNet, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
self.linear1 = nn.Linear(state_dim, hidden_dim)
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean_linear = nn.Linear(hidden_dim, action_dim)
self.mean_linear = nn.Linear(hidden_dim, n_actions)
self.mean_linear.weight.data.uniform_(-init_w, init_w)
self.mean_linear.bias.data.uniform_(-init_w, init_w)
self.log_std_linear = nn.Linear(hidden_dim, action_dim)
self.log_std_linear = nn.Linear(hidden_dim, n_actions)
self.log_std_linear.weight.data.uniform_(-init_w, init_w)
self.log_std_linear.bias.data.uniform_(-init_w, init_w)

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,222 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:53:54
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:41:19
Description:
Environment:
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
import random
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # the buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: when capacity is exceeded, the oldest stored transitions are dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class ValueNet(nn.Module):
def __init__(self, n_states, hidden_dim, init_w=3e-3):
super(ValueNet, self).__init__()
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class SoftQNet(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(SoftQNet, self).__init__()
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state, action):
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class PolicyNet(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
super(PolicyNet, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean_linear = nn.Linear(hidden_dim, n_actions)
self.mean_linear.weight.data.uniform_(-init_w, init_w)
self.mean_linear.bias.data.uniform_(-init_w, init_w)
self.log_std_linear = nn.Linear(hidden_dim, n_actions)
self.log_std_linear.weight.data.uniform_(-init_w, init_w)
self.log_std_linear.bias.data.uniform_(-init_w, init_w)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
return mean, log_std
def evaluate(self, state, epsilon=1e-6):
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
z = normal.sample()
action = torch.tanh(z)
log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
log_prob = log_prob.sum(-1, keepdim=True)
return action, log_prob, z, mean, log_std
def get_action(self, state):
state = torch.FloatTensor(state).unsqueeze(0).to(device)
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
z = normal.sample()
action = torch.tanh(z)
action = action.detach().cpu().numpy()
return action[0]
class SAC:
def __init__(self,n_states,n_actions,cfg) -> None:
self.batch_size = cfg.batch_size
self.memory = ReplayBuffer(cfg.capacity)
self.device = cfg.device
self.value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
self.target_value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
self.soft_q_net = SoftQNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
self.policy_net = PolicyNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(param.data)
self.value_criterion = nn.MSELoss()
self.soft_q_criterion = nn.MSELoss()
def update(self, gamma=0.99,mean_lambda=1e-3,
std_lambda=1e-3,
z_lambda=0.0,
soft_tau=1e-2,
):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(self.batch_size)
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
expected_q_value = self.soft_q_net(state, action)
expected_value = self.value_net(state)
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
target_value = self.target_value_net(next_state)
next_q_value = reward + (1 - done) * gamma * target_value
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
expected_new_q_value = self.soft_q_net(state, new_action)
next_value = expected_new_q_value - log_prob
value_loss = self.value_criterion(expected_value, next_value.detach())
log_prob_target = expected_new_q_value - expected_value
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
mean_loss = mean_lambda * mean.pow(2).mean()
std_loss = std_lambda * log_std.pow(2).mean()
z_loss = z_lambda * z.pow(2).sum(1).mean()
policy_loss += mean_loss + std_loss + z_loss
self.soft_q_optimizer.zero_grad()
q_value_loss.backward()
self.soft_q_optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
self.value_optimizer.step()
self.policy_optimizer.zero_grad()
policy_loss.backward()
self.policy_optimizer.step()
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - soft_tau) + param.data * soft_tau
)
def save(self, path):
torch.save(self.value_net.state_dict(), path + "sac_value")
torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")
torch.save(self.policy_net.state_dict(), path + "sac_policy")
torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")
def load(self, path):
self.value_net.load_state_dict(torch.load(path + "sac_value"))
self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
self.target_value_net = copy.deepcopy(self.value_net)
self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))
self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))

View File

@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:59:22
LastEditor: JiangJi
LastEditTime: 2021-05-06 16:58:01
LastEditTime: 2021-12-22 16:27:13
Description:
Environment:
'''
@@ -18,23 +18,24 @@ import gym
import torch
import datetime
from SAC.env import NormalizedActions
from SAC.agent import SAC
from SoftActorCritic.env_wrapper import NormalizedActions
from SoftActorCritic.sac import SAC
from common.utils import save_results, make_dir
from common.plot import plot_rewards
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
algo_name = 'SAC'  # algorithm name
env_name = 'Pendulum-v1'  # environment name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
class SACConfig:
def __init__(self) -> None:
self.algo = 'SAC'
self.env_name = 'Pendulum-v1'
self.result_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
self.algo_name = algo_name
        self.env_name = env_name  # environment name
self.device= device
self.train_eps = 300
self.train_steps = 500
self.test_eps = 50
self.eval_steps = 500
self.test_eps = 20
        self.max_steps = 500  # max steps per episode
self.gamma = 0.99
self.mean_lambda=1e-3
self.std_lambda=1e-3
@@ -46,33 +47,36 @@ class SACConfig:
self.capacity = 1000000
self.hidden_dim = 256
self.batch_size = 128
self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class PlotConfig(SACConfig):
def __init__(self) -> None:
super().__init__()
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
class PlotConfig:
def __init__(self) -> None:
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.device = device
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path to save models
        self.save = True  # whether to save figures
def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name))
env.seed(seed)
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
agent = SAC(state_dim,action_dim,cfg)
n_actions = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
agent = SAC(n_states,n_actions,cfg)
return env,agent
def train(cfg,env,agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record moving-average rewards for all episodes
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # accumulate the reward within one episode
        state = env.reset()  # reset the environment and get the initial state
for i_step in range(cfg.train_steps):
for i_step in range(cfg.max_steps):
action = agent.policy_net.get_action(state)
next_state, reward, done, _ = env.step(action)
agent.memory.push(state, action, reward, next_state, done)
@@ -81,57 +85,57 @@ def train(cfg,env,agent):
ep_reward += reward
if done:
break
if (i_ep+1)%10==0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Complete training')
if (i_ep+1)%10 == 0:
            print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.3f}')
    print('Finish training!')
return rewards, ma_rewards
def eval(cfg,env,agent):
    print('Start evaluating!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []
    ma_rewards = []  # moving average rewards
def test(cfg,env,agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record moving-average rewards for all episodes
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for i_step in range(cfg.eval_steps):
for i_step in range(cfg.max_steps):
action = agent.policy_net.get_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
if (i_ep+1)%10==0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
    print('Complete evaluating')
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
    print('Finish testing!')
return rewards, ma_rewards
if __name__ == "__main__":
cfg=SACConfig()
plot_cfg = PlotConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
    # train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
    agent.save(path=plot_cfg.model_path)  # save the model
    save_results(rewards, ma_rewards, tag='train',
                 path=plot_cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
    # test
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=plot_cfg.model_path)  # load the model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot results

View File

@@ -70,9 +70,9 @@
"def env_agent_config(cfg,seed=1):\n",
" env = NormalizedActions(gym.make(\"Pendulum-v0\"))\n",
" env.seed(seed)\n",
" action_dim = env.action_space.shape[0]\n",
" state_dim = env.observation_space.shape[0]\n",
" agent = SAC(state_dim,action_dim,cfg)\n",
" n_actions = env.action_space.shape[0]\n",
" n_states = env.observation_space.shape[0]\n",
" agent = SAC(n_states,n_actions,cfg)\n",
" return env,agent"
]
},
@@ -159,7 +159,7 @@
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mDeprecatedEnv\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-91b1038013e4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0magent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrewards\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma_rewards\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mmake_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0maction_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstate_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_actions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mn_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(id, **kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mregistry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(self, path, **kwargs)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Making new env: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mspec\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 185\u001b[0m raise error.DeprecatedEnv(\n\u001b[1;32m 186\u001b[0m \"Env {} not found (valid versions include {})\".format(\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmatching_envs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m )\n\u001b[1;32m 189\u001b[0m )\n",

View File

@@ -14,13 +14,13 @@ import torch
class ReplayBuffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
def __init__(self, n_states, n_actions, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.state = np.zeros((max_size, n_states))
self.action = np.zeros((max_size, n_actions))
self.next_state = np.zeros((max_size, n_states))
self.reward = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
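Only __init__ appears in this hunk. Assuming the rest of the file follows the standard TD3 reference buffer, the matching add method writes through a circular pointer (a sketch, not the repo's verified code):

```python
    def add(self, state, action, next_state, reward, done):
        # write at the current pointer, then advance it circularly
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
```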

View File

@@ -74,10 +74,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
td3= TD3(state_dim,action_dim,max_action,cfg)
td3= TD3(n_states,n_actions,max_action,cfg)
cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
td3.load(cfg.model_path)
td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed)

View File

@@ -72,7 +72,7 @@ def train(cfg,env,agent):
else:
action = (
agent.choose_action(np.array(state))
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
).clip(-max_action, max_action)
# Perform action
next_state, reward, done, _ = env.step(action)
@@ -121,11 +121,11 @@ def train(cfg,env,agent):
# else:
# action = (
# agent.choose_action(np.array(state))
# + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
# + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
# ).clip(-max_action, max_action)
# # action = (
# # agent.choose_action(np.array(state))
# # + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
# # + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
# # ).clip(-max_action, max_action)
# # Perform action
# next_state, reward, done, _ = env.step(action)
@@ -157,10 +157,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(state_dim,action_dim,max_action,cfg)
agent = TD3(n_states,n_actions,max_action,cfg)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(cfg.result_path,cfg.model_path)
agent.save(path=cfg.model_path)

View File

@@ -70,10 +70,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
td3= TD3(state_dim,action_dim,max_action,cfg)
td3= TD3(n_states,n_actions,max_action,cfg)
cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
td3.load(cfg.model_path)

View File

@@ -79,7 +79,7 @@ def train(cfg,env,agent):
else:
action = (
agent.choose_action(np.array(state))
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
).clip(-max_action, max_action)
# Perform action
next_state, reward, done, _ = env.step(action)
@@ -109,10 +109,10 @@ if __name__ == "__main__":
    env.seed(1)  # random seed
torch.manual_seed(1)
np.random.seed(1)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(state_dim,action_dim,max_action,cfg)
agent = TD3(n_states,n_actions,max_action,cfg)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(plot_cfg.result_path,plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
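The training loops in these TD3 scripts perturb the deterministic action with zero-mean Gaussian noise scaled by max_action * cfg.expl_noise, then clip back to the action bounds. A minimal illustration, with assumed Pendulum-style bounds:

```python
import numpy as np

max_action, expl_noise, n_actions = 2.0, 0.1, 1  # Pendulum-like bounds (assumed)
action = np.array([1.95])                        # deterministic policy output
noisy_action = (action
                + np.random.normal(0, max_action * expl_noise, size=n_actions)
                ).clip(-max_action, max_action)
print(noisy_action)                              # always within [-2.0, 2.0]
```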

View File

@@ -0,0 +1,284 @@
import numpy as np
import os
os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces
import cv2
cv2.ocl.setUseOpenCL(False)
from .wrappers import TimeLimit
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
def step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert sometimes we stay in lives == 0 condition for a few frames
# so it's important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class ClipRewardEnv(gym.RewardWrapper):
def __init__(self, env):
gym.RewardWrapper.__init__(self, env)
def reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
"""
Warp frames to 84x84 as done in the Nature paper and later work.
If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which
observation should be warped.
"""
super().__init__(env)
self._width = width
self._height = height
self._grayscale = grayscale
self._key = dict_space_key
if self._grayscale:
num_colors = 1
else:
num_colors = 3
new_space = gym.spaces.Box(
low=0,
high=255,
shape=(self._height, self._width, num_colors),
dtype=np.uint8,
)
if self._key is None:
original_space = self.observation_space
self.observation_space = new_space
else:
original_space = self.observation_space.spaces[self._key]
self.observation_space.spaces[self._key] = new_space
assert original_space.dtype == np.uint8 and len(original_space.shape) == 3
def observation(self, obs):
if self._key is None:
frame = obs
else:
frame = obs[self._key]
if self._grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(
frame, (self._width, self._height), interpolation=cv2.INTER_AREA
)
if self._grayscale:
frame = np.expand_dims(frame, -1)
if self._key is None:
obs = frame
else:
obs = obs.copy()
obs[self._key] = frame
return obs
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=-1)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
def count(self):
frames = self._force()
return frames.shape[frames.ndim - 1]
def frame(self, i):
return self._force()[..., i]
def make_atari(env_id, max_episode_steps=None):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
if max_episode_steps is not None:
env = TimeLimit(env, max_episode_steps=max_episode_steps)
return env
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
"""
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
return env
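A usage sketch for the two entry points above (assumes a gym install with Atari ROMs; the env id is just an example):

```python
# Build a DeepMind-style Atari pipeline: no-op resets, frame skipping,
# life-loss episodes, 84x84 grayscale frames, reward clipping, frame stacking.
env = make_atari('PongNoFrameskip-v4')
env = wrap_deepmind(env, frame_stack=True)
obs = env.reset()
print(env.observation_space.shape)  # (84, 84, 4) once frames are stacked
print(np.asarray(obs).shape)        # LazyFrames materialize on conversion
```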

View File

@@ -32,10 +32,10 @@ class MLP(nn.Module):
return self.fc3(x)
class Critic(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1)
    # randomly initialize the output layer with small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
return x
class Actor(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, action_dim)
self.linear3 = nn.Linear(hidden_size, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
return x
class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim=256):
def __init__(self, n_states, n_actions, hidden_dim=256):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=1),
)

codes/common/wrappers.py (new file)
View File

@@ -0,0 +1,29 @@
import gym
class TimeLimit(gym.Wrapper):
def __init__(self, env, max_episode_steps=None):
super(TimeLimit, self).__init__(env)
self._max_episode_steps = max_episode_steps
self._elapsed_steps = 0
def step(self, ac):
observation, reward, done, info = self.env.step(ac)
self._elapsed_steps += 1
if self._elapsed_steps >= self._max_episode_steps:
done = True
info['TimeLimit.truncated'] = True
return observation, reward, done, info
def reset(self, **kwargs):
self._elapsed_steps = 0
return self.env.reset(**kwargs)
class ClipActionsWrapper(gym.Wrapper):
def step(self, action):
import numpy as np
action = np.nan_to_num(action)
action = np.clip(action, self.action_space.low, self.action_space.high)
return self.env.step(action)
def reset(self, **kwargs):
return self.env.reset(**kwargs)
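A short usage sketch for the TimeLimit wrapper above, under the old gym 4-tuple step API it targets (Pendulum-v1's built-in limit is 200 steps, so an outer cap of 100 fires first):

```python
import gym

env = TimeLimit(gym.make('Pendulum-v1'), max_episode_steps=100)
state, done, steps = env.reset(), False, 0
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    steps += 1
print(steps, info.get('TimeLimit.truncated', False))  # 100 True
```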

View File

@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
self.natural = natural
# Start the first game
self._reset() # Number of
self.action_dim = 2
self.n_actions = 2
def reset(self):
return self._reset()

View File

@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
self.shape = (4, 12)
nS = np.prod(self.shape)
action_dim = 4
n_actions = 4
# Cliff Location
self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
P = {}
for s in range(nS):
position = np.unravel_index(s, self.shape)
P[s] = { a : [] for a in range(action_dim) }
P[s] = { a : [] for a in range(n_actions) }
P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
isd = np.zeros(nS)
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
def render(self, mode='human', close=False):
self._render(mode, close)

View File

@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
self.shape = shape
nS = np.prod(shape)
action_dim = 4
n_actions = 4
MAX_Y = shape[0]
MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
y, x = it.multi_index
# P[s][a] = (prob, next_state, reward, is_done)
P[s] = {a : [] for a in range(action_dim)}
P[s] = {a : [] for a in range(n_actions)}
is_done = lambda s: s == 0 or s == (nS - 1)
reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
# This should not be used in any model-free learning algorithm
self.P = P
super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
def _render(self, mode='human', close=False):
""" Renders the current gridworld layout

View File

@@ -17,31 +17,31 @@ class StochasticMDP:
def __init__(self):
self.end = False
self.curr_state = 2
self.action_dim = 2
self.state_dim = 6
self.n_actions = 2
self.n_states = 6
self.p_right = 0.5
def reset(self):
self.end = False
self.curr_state = 2
state = np.zeros(self.state_dim)
state = np.zeros(self.n_states)
state[self.curr_state - 1] = 1.
return state
def step(self, action):
if self.curr_state != 1:
if action == 1:
if random.random() < self.p_right and self.curr_state < self.state_dim:
if random.random() < self.p_right and self.curr_state < self.n_states:
self.curr_state += 1
else:
self.curr_state -= 1
if action == 0:
self.curr_state -= 1
if self.curr_state == self.state_dim:
if self.curr_state == self.n_states:
self.end = True
state = np.zeros(self.state_dim)
state = np.zeros(self.n_states)
state[self.curr_state - 1] = 1.
if self.curr_state == 1:

View File

@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
self.shape = (7, 10)
nS = np.prod(self.shape)
action_dim = 4
n_actions = 4
# Wind strength
winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
P = {}
for s in range(nS):
position = np.unravel_index(s, self.shape)
P[s] = { a : [] for a in range(action_dim) }
P[s] = { a : [] for a in range(n_actions) }
P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
isd = np.zeros(nS)
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
def render(self, mode='human', close=False):
self._render(mode, close)