update rainbowdqn

This commit is contained in:
johnjim0816
2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions

View File

@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
class A2C:
''' A2C算法
'''
def __init__(self,state_dim,action_dim,cfg) -> None:
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = cfg.device
self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def compute_returns(self,next_value, rewards, masks):

View File

@@ -10,7 +10,7 @@ import torch
import torch.optim as optim
import datetime
from common.multiprocessing_env import SubprocVecEnv
from A2C.agent import ActorCritic
from a2c import ActorCritic
from common.utils import save_results, make_dir
from common.utils import plot_rewards
@@ -74,9 +74,9 @@ def train(cfg,envs):
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
state_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.n
model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
n_states = envs.observation_space.shape[0]
n_actions = envs.action_space.n
model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
optimizer = optim.Adam(model.parameters())
frame_idx = 0
test_rewards = []

View File

@@ -39,11 +39,11 @@ class ReplayBuffer:
'''
return len(self.buffer)
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(state_dim, hidden_dim)
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, action_dim)
self.linear3 = nn.Linear(hidden_dim, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -54,10 +54,10 @@ class Actor(nn.Module):
x = torch.tanh(self.linear3(x))
return x
class Critic(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# 随机初始化为较小的值
@@ -72,12 +72,12 @@ class Critic(nn.Module):
x = self.linear3(x)
return x
class DDPG:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.device = cfg.device
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
# 复制参数到目标网络
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):

View File

@@ -39,15 +39,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2021-09-16 01:31:33
LastEditTime: 2022-02-10 06:23:27
@Discription:
@Environment: python 3.7.7
'''
@@ -18,23 +18,29 @@ import datetime
import gym
import torch
from DDPG.env import NormalizedActions
from DDPG.agent import DDPG
from env import NormalizedActions,OUNoise
from ddpg import DDPG
from DDPG.train import train,test
from common.utils import save_results,make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = 'DDPG' # 算法名称
env_name = 'Pendulum-v1' # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
class Config:
'''超参数
'''
class DDPGConfig:
def __init__(self):
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
################################## 环境超参数 ###################################
self.algo_name = 'DDPG' # 算法名称
self.env_name = 'Pendulum-v1' # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 300 # 训练的回合数
self.test_eps = 50 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 折扣因子
self.critic_lr = 1e-3 # 评论家网络的学习率
self.actor_lr = 1e-4 # 演员网络的学习率
@@ -43,39 +49,92 @@ class DDPGConfig:
self.target_update = 2 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层维度
self.soft_tau = 1e-2 # 软更新参数
################################################################################
class PlotConfig:
def __init__(self) -> None:
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
################################################################################
def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
env.seed(seed) # 随机种子
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = DDPG(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
return env,agent
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo_name},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
cfg = DDPGConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = test(plot_cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

View File

@@ -1,64 +0,0 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
from DDPG.env import OUNoise
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards, ma_rewards

View File

@@ -50,15 +50,15 @@ import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, state_dim=4, action_dim=18):
def __init__(self, n_states=4, n_actions=18):
""" 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(state_dim, 128) # 输入层
self.fc1 = nn.Linear(n_states, 128) # 输入层
self.fc2 = nn.Linear(128, 128) # 隐藏层
self.fc3 = nn.Linear(128, action_dim) # 输出层
self.fc3 = nn.Linear(128, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
x = F.relu(self.fc2(x))
return self.fc3(x)
```
The input dimension is state_dim and the output dimension is action_dim, with one 128-dimensional hidden layer (the number and width of hidden layers can be increased as needed), typically followed by ReLU activations; this is the same kind of network setup used in ordinary deep learning.
The input dimension is n_states and the output dimension is n_actions, with one 128-dimensional hidden layer (the number and width of hidden layers can be increased as needed), typically followed by ReLU activations; this is the same kind of network setup used in ordinary deep learning.
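
As a quick sanity check, a forward pass through this network might look like the following minimal sketch (it assumes the `FCN` class above; the dummy state is CartPole-style, with 4 features and 2 actions):
```python
import torch

# Hypothetical usage sketch: a 4-dimensional state and 2 discrete actions (CartPole-like).
net = FCN(n_states=4, n_actions=2)
state = torch.randn(1, 4)            # a batch containing one state
q_values = net(state)                # shape (1, 2): one Q-value per action
action = q_values.max(1)[1].item()   # greedy action index
print(q_values, action)
```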
### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
Inside the class we create the two networks, together with the optimizer and the replay memory:
```python
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
if random.random() > self.epsilon(self.frame_idx):
action = self.predict(state)
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
```
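
The `self.epsilon(self.frame_idx)` call above is the exponentially decaying exploration rate used throughout these implementations. A standalone sketch of that schedule (with illustrative hyperparameter values, not the exact ones from any particular config here) is:
```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500  # illustrative values

def epsilon(frame_idx):
    # decays from epsilon_start towards epsilon_end as frame_idx grows
    return epsilon_end + (epsilon_start - epsilon_end) * \
        math.exp(-1. * frame_idx / epsilon_decay)

print(epsilon(0))      # ~0.95: almost always explore at the start
print(epsilon(5000))   # ~0.01: almost always exploit after decay
```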

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-12-22 14:01:37
LastEditTime: 2022-03-02 11:05:11
@Discription:
@Environment: python 3.7.7
'''
@@ -20,22 +20,7 @@ import random
import math
import numpy as np
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
""" 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态维度
action_dim: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
class ReplayBuffer:
def __init__(self, capacity):
@@ -62,9 +47,9 @@ class ReplayBuffer:
return len(self.buffer)
class DQN:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_actions,model,cfg):
self.action_dim = action_dim # 总的动作个数
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
@@ -73,8 +58,8 @@ class DQN:
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -86,23 +71,24 @@ class DQN:
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
# print('updating')
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值

View File

@@ -70,9 +70,9 @@ class ReplayBuffer:
return len(self.buffer)
class DQN:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.action_dim = action_dim # 总的动作个数
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
@@ -81,8 +81,8 @@ class DQN:
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(state_dim, action_dim).to(self.device)
self.target_net = CNN(state_dim, action_dim).to(self.device)
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -94,11 +94,12 @@ class DQN:
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
print(type(state))
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略

codes/DQN/dqn_cnn2.py (new file, 142 lines)
View File

@@ -0,0 +1,142 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
import math
import numpy as np
class CNN(nn.Module):
def __init__(self, n_frames, n_actions):
super(CNN,self).__init__()
self.n_frames = n_frames
self.n_actions = n_actions
# Layers
self.conv1 = nn.Conv2d(
in_channels=n_frames,
out_channels=16,
kernel_size=8,
stride=4,
padding=2
)
self.conv2 = nn.Conv2d(
in_channels=16,
out_channels=32,
kernel_size=4,
stride=2,
padding=1
)
self.fc1 = nn.Linear(
in_features=3200,
out_features=256,
)
self.fc2 = nn.Linear(
in_features=256,
out_features=n_actions,
)
# Activation Functions
self.relu = nn.ReLU()
def flatten(self, x):
batch_size = x.size()[0]
x = x.view(batch_size, -1)
return x
def forward(self, x):
# Forward pass
x = self.relu(self.conv1(x)) # In: (80, 80, 4) Out: (20, 20, 16)
x = self.relu(self.conv2(x)) # In: (20, 20, 16) Out: (10, 10, 32)
x = self.flatten(x) # In: (10, 10, 32) Out: (3200,)
x = self.relu(self.fc1(x)) # In: (3200,) Out: (256,)
x = self.fc2(x) # In: (256,) Out: (4,)
return x
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)

Two binary image files added (28 KiB and 41 KiB), not shown.

View File

@@ -1,5 +1,7 @@
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
import torch
import datetime
import numpy as np
from common.utils import save_results, make_dir
from common.utils import save_results_1, make_dir
from common.utils import plot_rewards
from DQN.dqn import DQN
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
############################### hyperparameters ################################
self.algo_name = 'DQN' # algorithm name
self.env_name = 'CartPole-v0' # environment name
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
"cuda" if torch.cuda.is_available() else "cpu") # check GPU
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
self.test_eps = 20 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ##############################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
state_dim = env.observation_space.shape[0] # 状态维度
action_dim = env.action_space.n # 动作维度
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
model = MLP(n_states,n_actions)
agent = DQN(n_actions, model, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step += 1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 10 == 0:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
print('完成训练!')
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
print('Finish training!')
env.close()
return rewards, ma_rewards
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
print('完成测试!')
env.close()
return rewards, ma_rewards
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
save_results_1(res_dic, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
res_dic = test(cfg, env, agent)
save_results_1(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果

View File

@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 11:40:44
LastEditTime: 2022-02-10 06:17:41
Discription: 使用 Nature DQN 训练 CartPole-v1
'''
import sys
@@ -19,7 +19,7 @@ import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from DQN.dqn import DQN
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = "DQN" # 算法名称
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
'''
env = gym.make(cfg.env_name) # 创建环境
env.seed(seed) # 设置随机种子
state_dim = env.observation_space.shape[0] # 状态维度
action_dim = env.action_space.n # 动作维度
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
agent = DQN(n_states, n_actions, cfg) # 创建智能体
return env, agent
def train(cfg, env, agent):

View File

@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:27:48
LastEditTime: 2022-02-10 06:17:46
Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
'''
import sys
@@ -20,7 +20,7 @@ import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from common.atari_wrappers import make_atari, wrap_deepmind
from DQN.dqn import DQN
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = 'DQN-cnn' # 算法名称
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
# env = wrap_deepmind(env)
# env = wrap_pytorch(env)
env.seed(seed) # 设置随机种子
state_dim = env.observation_space.shape[0] # 状态维度
action_dim = env.action_space.n # 动作维度
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
agent = DQN(n_states, n_actions, cfg) # 创建智能体
return env, agent
def train(cfg, env, agent):

codes/DQN/task4.py (new file, 180 lines)
View File

@@ -0,0 +1,180 @@
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results_1, make_dir
from common.utils import plot_rewards
from dqn_1 import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=256):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc4 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return self.fc4(x)
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
# self.env_name = 'Breakout-ram-v0' # 环境名称
self.env_name = 'ALE/Pong-ram-v5'
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 5 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500000 # e-greedy策略中epsilon的衰减率
self.lr = 0.00025 # 学习率
self.memory_capacity = int(5e4) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
model = MLP(n_states,n_actions)
agent = DQN(n_states, n_actions, model, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
ep_step = 0
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results_1(res_dic, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results_1(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果

codes/DQN/task5.py (new file, 149 lines)
View File

@@ -0,0 +1,149 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
self.env_name = 'SpaceInvaders-ram-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 20000 # e-greedy策略中epsilon的衰减率
self.lr = 2e-4 # 学习率
self.memory_capacity = int(1e5) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
agent = DQN(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

codes/DQN/test copy.py (new file, 184 lines)
View File

@@ -0,0 +1,184 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
import matplotlib.pyplot as plt
class DQN:
def __init__(self, env):
self.env = env
self.memory = deque(maxlen=400000)
self.gamma = 0.99
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = self.epsilon_min / 500000
self.batch_size = 32
self.train_start = 1000
self.state_size = self.env.observation_space.shape[0]*4
self.action_size = self.env.action_space.n
self.learning_rate = 0.00025
self.evaluation_model = self.create_model()
self.target_model = self.create_model()
def create_model(self):
model = Sequential()
model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(self.env.action_space.n, activation='linear'))
model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))
return model
def choose_action(self, state, steps):
if steps > 50000:
if self.epsilon > self.epsilon_min:
self.epsilon -= self.epsilon_decay
if np.random.random() < self.epsilon:
return self.env.action_space.sample()
return np.argmax(self.evaluation_model.predict(state)[0])
def remember(self, cur_state, action, reward, new_state, done):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = (cur_state, action, reward, new_state, done)
self.memory.extend([transition])
self.memory_counter += 1
def replay(self):
if len(self.memory) < self.train_start:
return
mini_batch = random.sample(self.memory, self.batch_size)
update_input = np.zeros((self.batch_size, self.state_size))
update_target = np.zeros((self.batch_size, self.action_size))
for i in range(self.batch_size):
state, action, reward, new_state, done = mini_batch[i]
target = self.evaluation_model.predict(state)[0]
if done:
target[action] = reward
else:
target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])
update_input[i] = state
update_target[i] = target
self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
def target_train(self):
self.target_model.set_weights(self.evaluation_model.get_weights())
return
def visualize(self, reward, episode):
plt.plot(episode, reward, 'ob-')
plt.title('Average reward each 100 episode')
plt.ylabel('Reward')
plt.xlabel('Episodes')
plt.grid()
plt.show()
def transform(self,state):
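# Unpack each of the 128 RAM bytes (0-255) into four 2-bit values (0-3), turning the
# 1x128 state into a 512-dim vector; if the input is already 512-dim, return it unchanged.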
if state.shape[1]==512:
return state
a=[np.binary_repr(x,width=8) for x in state[0]]
res=[]
for x in a:
res.extend([x[:2],x[2:4],x[4:6],x[6:]])
res=[int(x,2) for x in res]
return np.array(res)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def main():
# env = gym.make('Breakout-ram-v0')
env = gym.make('Breakout-ram-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space.shape[0])
print(env.observation_space.high)
print(env.observation_space.low)
#print(env.observation_space.shape)
episodes = 5000
trial_len = 10000
tmp_reward=0
sum_rewards = 0
n_success = 0
total_steps = 0
graph_reward = []
graph_episodes = []
time_record = []
dqn_agent = DQN(env=env)
for i_episode in range(episodes):
start_time = time.time()
total_reward = 0
cur_state = env.reset().reshape(1,128)
cur_state=dqn_agent.transform(cur_state).reshape(1,128*4)/4
i_step=0
for step in range(trial_len):
#env.render()
i_step+=1
action = dqn_agent.choose_action(cur_state, total_steps)
new_state, reward, done, _ = env.step(action)
new_state = new_state.reshape(1, 128)
new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
total_reward += reward
sum_rewards += reward
tmp_reward += reward
if reward>0: # experimental reward clipping: treat any positive reward as 1, to test whether it helps
reward=1
dqn_agent.remember(cur_state, action, reward, new_state, done)
if total_steps > 10000:
if total_steps%4 == 0:
dqn_agent.replay()
if total_steps%5000 == 0:
dqn_agent.target_train()
cur_state = new_state
total_steps += 1
if done:
env.reset()
break
if (i_episode+1) % 100 == 0:
graph_reward.append(sum_rewards/100)
graph_episodes.append(i_episode+1)
sum_rewards = 0
print("Episode ",i_episode+1," Reward: ")
print(graph_reward[-1])
end_time = time.time()
time_record.append(end_time-start_time)
print("NOW in episode: " + str(i_episode))
print("Time cost: " + str(end_time-start_time))
print("Reward: ",tmp_reward)
print("Step:", i_step)
tmp_reward=0
print("Reward: ")
print(graph_reward)
print("Episode: ")
print(graph_episodes)
print("Average_time: ")
print(sum(time_record)/5000)
dqn_agent.visualize(graph_reward, graph_episodes)
if __name__ == '__main__':
main()

View File

@@ -90,15 +90,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):

View File

@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境如下图它通过向左(动作=0
import gym
env = gym.make('CartPole-v0') # 建立环境
env.seed(1) # 随机种子
state_dim = env.observation_space.shape[0] # 状态维度
action_dim = env.action_space.n # 动作维度
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
state = env.reset() # 初始化环境
print(f"状态维度:{state_dim},动作维度:{action_dim}")
print(f"状态维度:{n_states},动作维度:{n_actions}")
print(f"初始状态:{state}")
```
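
To make the interface concrete, here is a minimal interaction sketch (an illustrative addition, assuming the `env` created above): it runs one episode with random actions and shows the (state, reward, done) loop that all the agents here build on.
```python
state = env.reset()                 # initial observation
done = False
ep_reward = 0
while not done:
    action = env.action_space.sample()              # pick a random action
    next_state, reward, done, _ = env.step(action)  # old gym API: 4-tuple return
    ep_reward += reward
    state = next_state
print(f"episode reward: {ep_reward}")
```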
@@ -157,7 +157,7 @@ def choose_action(self, state):
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
```
As you can see, this is essentially the same as the tabular Q-learning algorithm: both use the $\epsilon$-greedy policy. The only difference is that with a neural network we need a tool such as PyTorch or TensorFlow to handle the corresponding data.
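
For reference, the complete action-selection method (including the state-to-tensor conversion that the hunk above omits) mirrors the DQN implementation shown earlier in this commit; roughly:
```python
def choose_action(self, state):
    '''epsilon-greedy action selection, as used in the DQN agent of this repo.'''
    self.frame_idx += 1
    if random.random() > self.epsilon(self.frame_idx):
        with torch.no_grad():
            # wrap the numpy state into a (1, n_states) float tensor on the right device
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            q_values = self.policy_net(state)
            action = q_values.max(1)[1].item()  # greedy action w.r.t. predicted Q-values
    else:
        action = random.randrange(self.n_actions)  # random exploration
    return action
```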

View File

@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
Here the program uses a wrapper to redefine the environment; this does not affect how the environment is understood, and interested readers can look at the corresponding code. Since gym environments are well encapsulated, using this environment only requires calling gym.make with the environment name, after which we can inspect the state and action dimensions:
```python
state_dim = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度
print(f"状态维度:{state_dim},动作维度:{action_dim}")
n_states = env.observation_space.n # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"状态维度:{n_states},动作维度:{n_actions}")
```
The printed result is as follows:
@@ -72,9 +72,9 @@ print(state)
env = gym.make('CliffWalking-v0') # 定义环境
env = CliffWalkingWapper(env) # 装饰环境
env.seed(1) # 设置随机种子
state_dim = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度
agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数
n_states = env.observation_space.n # 状态维度
n_actions = env.action_space.n # 动作维度
agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
ep_reward = 0 # 记录每个回合的奖励
state = env.reset() # 重置环境
@@ -126,7 +126,7 @@ def choose_action(self, state):
if np.random.uniform(0, 1) > self.epsilon:
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
else:
action = np.random.choice(self.action_dim) # 随机选择动作
action = np.random.choice(self.n_actions) # 随机选择动作
return action
```
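
The excerpt above shows environment setup and action selection but not the learning step. The update applies the standard Q-learning rule $Q(s,a) \leftarrow Q(s,a) + \alpha\,[r + \gamma \max_{a'} Q(s',a') - Q(s,a)]$; a sketch consistent with the `Q_table` usage above (assuming a learning rate `self.lr` and discount `self.gamma` taken from `cfg`, and `numpy` imported as `np`) might be:
```python
def update(self, state, action, reward, next_state, done):
    q_predict = self.Q_table[str(state)][action]
    if done:
        q_target = reward  # terminal transition: no bootstrapping
    else:
        q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
    # move Q(s, a) towards the TD target
    self.Q_table[str(state)][action] += self.lr * (q_target - q_predict)
```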

View File

@@ -46,15 +46,15 @@ class ReplayBuffer:
return len(self.buffer)
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态维度
action_dim: 输出的动作维度
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
@@ -63,8 +63,8 @@ class MLP(nn.Module):
return self.fc3(x)
class DoubleDQN:
def __init__(self, state_dim, action_dim, cfg):
self.action_dim = action_dim # 总的动作个数
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma
# e-greedy策略相关参数
@@ -73,8 +73,8 @@ class DoubleDQN:
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data)
@@ -103,7 +103,7 @@ class DoubleDQN:
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):

View File

@@ -59,9 +59,9 @@ class Config:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = DoubleDQN(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DoubleDQN(n_states,n_actions,cfg)
return env,agent
def train(cfg,env,agent):

View File

@@ -136,12 +136,12 @@
"outputs": [],
"source": [
"class DuelingNet(nn.Module):\n",
" def __init__(self, state_dim, action_dim,hidden_size=128):\n",
" def __init__(self, n_states, n_actions,hidden_size=128):\n",
" super(DuelingNet, self).__init__()\n",
" \n",
" # 隐藏层\n",
" self.hidden = nn.Sequential(\n",
" nn.Linear(state_dim, hidden_size),\n",
" nn.Linear(n_states, hidden_size),\n",
" nn.ReLU()\n",
" )\n",
" \n",
@@ -149,7 +149,7 @@
" self.advantage = nn.Sequential(\n",
" nn.Linear(hidden_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Linear(hidden_size, action_dim)\n",
" nn.Linear(hidden_size, n_actions)\n",
" )\n",
" \n",
" # 价值函数\n",
@@ -192,7 +192,7 @@
],
"source": [
"class DuelingDQN:\n",
" def __init__(self,state_dim,action_dim,cfg) -> None:\n",
" def __init__(self,n_states,n_actions,cfg) -> None:\n",
" self.batch_size = cfg.batch_size\n",
" self.device = cfg.device\n",
" self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
" target_param.data.copy_(param.data)\n",
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
" q_values = self.policy_net(state)\n",
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
" else:\n",
" action = random.randrange(self.action_dim)\n",
" action = random.randrange(self.n_actions)\n",
" return action\n",
" def update(self):\n",
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n",

View File

@@ -57,16 +57,16 @@ class MLP(nn.Module):
return self.fc3(x)
class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg):
self.state_dim = state_dim
self.action_dim = action_dim
def __init__(self,n_states,n_actions,cfg):
self.n_states = n_states
self.n_actions = n_actions
self.gamma = cfg.gamma
self.device = cfg.device
self.batch_size = cfg.batch_size
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
@@ -76,7 +76,7 @@ class HierarchicalDQN:
self.losses = []
self.meta_losses = []
def to_onehot(self,x):
oh = np.zeros(self.state_dim)
oh = np.zeros(self.n_states)
oh[x - 1] = 1.
return oh
def set_goal(self,state):
@@ -85,7 +85,7 @@ class HierarchicalDQN:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item()
else:
goal = random.randrange(self.state_dim)
goal = random.randrange(self.n_states)
return goal
def choose_action(self,state):
self.frame_idx += 1
@@ -95,7 +95,7 @@ class HierarchicalDQN:
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):
self.update_policy()

View File

@@ -63,9 +63,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = HierarchicalDQN(n_states,n_actions,cfg)
return env,agent
if __name__ == "__main__":

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2020 John Jim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,7 +0,0 @@
## Changelog of updates
**2021.12.28-1**: merged the two Config classes in ```task.py``` into one and added comments for readability; updates start from the DQN algorithm
**2021.12.22-3**: renamed ```agent.py``` to the corresponding algorithm name, to distinguish cases such as ```dqn``` vs ```dqn_cnn```
**2021.12.22-2**: simplified the code structure by merging the original ```train.py```, ```task.py```, etc. into ```task.py```
**2021.12.22-1**: simplified the code structure by merging the original ```model.py```, ```memory.py```, etc. into ```agent.py```, and merging the contents of ```plot.py``` into ```common.utils.py```

View File

@@ -17,11 +17,11 @@ import dill
class FisrtVisitMC:
''' On-Policy First-Visit MC Control
'''
def __init__(self,action_dim,cfg):
self.action_dim = action_dim
def __init__(self,n_actions,cfg):
self.n_actions = n_actions
self.epsilon = cfg.epsilon
self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(action_dim))
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns
self.returns_count = defaultdict(float)
@@ -29,11 +29,11 @@ class FisrtVisitMC:
''' e-greed policy '''
if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else:
action = np.random.randint(0,self.action_dim)
action = np.random.randint(0,self.n_actions)
return action
def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition
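The hunk ends just before the body of update. For orientation, here is a minimal first-visit MC control update consistent with the attributes defined above, assuming one_ep_transition is a list of (state, action, reward) tuples with hashable states (the repository's actual container may differ):

```python
def update(self, one_ep_transition):
    # first-visit MC: average the return following the FIRST occurrence
    # of each (state, action) pair in the episode
    sa_pairs = set((s, a) for s, a, _ in one_ep_transition)
    for state, action in sa_pairs:
        # index of the first visit to this pair
        first_idx = next(i for i, (s, a, _) in enumerate(one_ep_transition)
                         if s == state and a == action)
        # discounted return from the first visit to the end of the episode
        G = sum(r * (self.gamma ** t)
                for t, (_, _, r) in enumerate(one_ep_transition[first_idx:]))
        self.returns_sum[(state, action)] += G
        self.returns_count[(state, action)] += 1.0
        self.Q_table[state][action] = (self.returns_sum[(state, action)]
                                       / self.returns_count[(state, action)])
```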

View File

@@ -43,8 +43,8 @@ class MCConfig:
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
action_dim = 9
agent = FisrtVisitMC(action_dim, cfg)
n_actions = 9
agent = FisrtVisitMC(n_actions, cfg)
return env,agent
def train(cfg, env, agent):

View File

@@ -57,16 +57,16 @@ The model is just the two networks, actor and critic
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
def __init__(self,n_states, n_actions,
hidden_dim=256):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim=256):
def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
value = self.critic(state)
return value
```
Here the Actor simply produces a probability distribution (Categorical; other distributions from torch.distributions would also work), while the Critic outputs a value for the current state. The Critic's input dimension could also be ```state_dim+action_dim```, i.e. the action information is fed into the critic network as well, which tends to work a bit better; interested readers can try it.
Here the Actor simply produces a probability distribution (Categorical; other distributions from torch.distributions would also work), while the Critic outputs a value for the current state. The Critic's input dimension could also be ```n_states+n_actions```, i.e. the action information is fed into the critic network as well, which tends to work a bit better; interested readers can try it.
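As a concrete illustration of that suggestion, here is a minimal sketch of a critic that also takes the action as input; the class name and the one-hot action encoding are illustrative assumptions, not code from this repository:

```python
import torch
import torch.nn as nn

class StateActionCritic(nn.Module):
    '''Critic over (state, action) pairs; input size is n_states + n_actions.'''
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super(StateActionCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states + n_actions, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, state, action_onehot):
        # concatenate the state with a one-hot encoded discrete action
        x = torch.cat([state, action_onehot], dim=-1)
        return self.critic(x)
```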
### PPO update
Define an update function that mainly implements steps 6 and 7 of the pseudocode.
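For reference, a condensed sketch of what one such update step can look like with the clipped surrogate objective, assuming advantages, returns and old log-probabilities have already been computed (variable names are illustrative; the full agent also loops over epochs and mini-batches):

```python
import torch

def ppo_update_step(actor, critic, actor_opt, critic_opt,
                    states, actions, old_log_probs, advantages, returns,
                    policy_clip=0.2):
    # one clipped-surrogate step; all inputs are tensors on the same device
    dist = actor(states)                       # Categorical distribution
    new_log_probs = dist.log_prob(actions)
    ratio = (new_log_probs - old_log_probs).exp()
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantages
    actor_loss = -torch.min(surr1, surr2).mean()
    critic_loss = (returns - critic(states).squeeze(-1)).pow(2).mean()

    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()
    return actor_loss.item(), critic_loss.item()
```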

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:30:46
LastEditor: John
LastEditTime: 2021-09-26 22:00:07
Discription:
Environment:
'''
import numpy as np
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:29:24
LastEditor: John
LastEditTime: 2021-04-08 22:36:43
Discription:
Environment:
'''
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value

Binary file not shown (before: 28 KiB)
Binary file not shown (before: 78 KiB)
Binary file not shown (after: 27 KiB)
Binary file not shown (after: 69 KiB)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:17:42
LastEditor: John
LastEditTime: 2021-09-26 22:02:00
LastEditTime: 2021-12-31 19:38:33
Discription:
Environment:
'''
@@ -13,25 +13,89 @@ import os
import numpy as np
import torch
import torch.optim as optim
from PPO.model import Actor,Critic
from PPO.memory import PPOMemory
import torch.nn as nn
from torch.distributions.categorical import Categorical
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []
class Actor(nn.Module):
def __init__(self,n_states, n_actions,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, n_states,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value
class PPO:
def __init__(self, state_dim, action_dim,cfg):
def __init__(self, n_states, n_actions,cfg):
self.gamma = cfg.gamma
self.continuous = cfg.continuous
self.policy_clip = cfg.policy_clip
self.n_epochs = cfg.n_epochs
self.gae_lambda = cfg.gae_lambda
self.device = cfg.device
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
self.memory = PPOMemory(cfg.batch_size)
self.loss = 0
def choose_action(self, state):
state = torch.tensor([state], dtype=torch.float).to(self.device)
state = np.array([state]) # 先转成数组再转tensor更高效
state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor(state)
value = self.critic(state)
action = dist.sample()
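The comment in choose_action above ("convert to a NumPy array first, then to a tensor, which is more efficient") refers to a general PyTorch behavior: building a tensor directly from a Python list of ndarrays is slow and, in recent versions, emits a UserWarning. A small self-contained illustration (sizes are arbitrary):

```python
import numpy as np
import torch

# a batch of states collected as separate NumPy arrays
states = [np.random.rand(4).astype(np.float32) for _ in range(64)]

# slow path: torch iterates over the Python list element by element
slow = torch.tensor(states, dtype=torch.float)

# fast path: stack into one contiguous ndarray first, then convert once
fast = torch.tensor(np.array(states), dtype=torch.float)

assert slow.shape == fast.shape == (64, 4)
```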

View File

@@ -5,63 +5,127 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import numpy as np
import datetime
from common.plot import plot_rewards
from common.utils import plot_rewards
from common.utils import save_results,make_dir
from PPO.agent import PPO
from PPO.train import train
from ppo2 import PPO
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig:
class Config:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
################################## 环境超参数 ###################################
self.algo_name = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数
self.batch_size = 5
self.gamma=0.99
################################################################################
################################## 算法超参数 ####################################
self.batch_size = 5 # mini-batch SGD中的批量大小
self.gamma = 0.95 # 强化学习中的折扣因子
self.n_epochs = 4
self.actor_lr = 0.0003
self.critic_lr = 0.0003
self.gae_lambda=0.95
self.policy_clip=0.2
self.actor_lr = 0.0003 # actor的学习率
self.critic_lr = 0.0003 # critic的学习率
self.gae_lambda = 0.95
self.policy_clip = 0.2
self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.update_fre = 20 # 策略更新频率
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
if cfg.continuous:
n_actions = env.action_space.shape[0] # 动作维度
else:
n_actions = env.action_space.n # 动作维度
agent = PPO(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
return env,agent
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = 0
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
cfg = PPOConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
ep_reward += reward
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成测试!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env,agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg)
agent.load(path=cfg.model_path)
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,cfg,tag="test")

View File

@@ -6,10 +6,9 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import plot_rewards
from common.utils import save_results,make_dir
from PPO.agent import PPO
from PPO.train import train
from ppo2 import PPO
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
@@ -45,9 +44,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = PPO(n_states,n_actions,cfg)
return env,agent

File diff suppressed because one or more lines are too long

View File

@@ -1,121 +0,0 @@
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = 0
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
def eval(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
ep_reward += reward
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成训练!')
return rewards,ma_rewards
if __name__ == '__main__':
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import save_results,make_dir
from PPO.agent import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数
self.batch_size = 5
self.gamma=0.99
self.n_epochs = 4
self.actor_lr = 0.0003
self.critic_lr = 0.0003
self.gae_lambda=0.95
self.policy_clip=0.2
self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
return env,agent
cfg = PPOConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 16:35:58
LastEditor: John
LastEditTime: 2021-12-21 23:21:26
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出:概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, action_dim的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x

Binary file not shown (before: 28 KiB)
Binary file not shown (before: 61 KiB)
Binary file not shown (after: 28 KiB)
Binary file not shown (after: 55 KiB)

View File

@@ -5,21 +5,41 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2021-10-16 00:43:52
LastEditTime: 2022-02-10 01:25:27
Discription:
Environment:
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Bernoulli
from torch.autograd import Variable
import numpy as np
from PolicyGradient.model import MLP
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, n_actions的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = torch.sigmoid(self.fc3(x))
return x
class PolicyGradient:
def __init__(self, state_dim,cfg):
def __init__(self, n_states,cfg):
self.gamma = cfg.gamma
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size
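The MLP above ends in a single sigmoid unit, which is why Bernoulli is imported: the output is treated as the probability of one of CartPole's two actions. The agent's own choose_action is not part of this hunk; below is a minimal sketch under that assumption (which index counts as "left" follows the repository's convention):

```python
def choose_action(self, state):
    # single probability from the sigmoid head of the policy network
    state = torch.from_numpy(state).float()
    prob = self.policy_net(state)
    # sample 0 or 1 from a Bernoulli parameterized by that probability
    action = Bernoulli(prob).sample().item()
    return int(action)
```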

View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-02-10 06:13:21
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from itertools import count
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = "PolicyGradient" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.batch_size = 8 # mini-batch SGD中的批量大小
self.lr = 0.01 # 学习率
self.gamma = 0.99 # 强化学习中的折扣因子
self.hidden_dim = 36 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.test_eps, ep_reward))
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

View File

@@ -1,136 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2021-10-16 00:34:13
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加父路径到系统路径sys.path
import gym
import torch
import datetime
from itertools import count
from PolicyGradient.agent import PolicyGradient
from common.plot import plot_rewards
from common.utils import save_results,make_dir
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PGConfig:
def __init__(self):
self.algo = "PolicyGradient" # 算法名称
self.env = 'CartPole-v0' # 环境名称
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # 保存模型的路径
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
self.batch_size = 8
self.lr = 0.01 # 学习率
self.gamma = 0.99
self.hidden_dim = 36 # dimension of hidden layer
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
agent = PolicyGradient(state_dim,cfg)
return env,agent
def train(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete training')
return rewards, ma_rewards
def eval(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete evaling')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = PGConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)

Binary file not shown (before: 24 KiB)
Binary file not shown (before: 38 KiB)
Binary file not shown (before: 13 KiB)
Binary file not shown (before: 18 KiB)

Some files were not shown because too many files have changed in this diff.