update rainbowdqn
This commit is contained in:
@@ -50,15 +50,15 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
class FCN(nn.Module):
|
||||
def __init__(self, state_dim=4, action_dim=18):
|
||||
def __init__(self, n_states=4, n_actions=18):
|
||||
""" 初始化q网络,为全连接网络
|
||||
state_dim: 输入的feature即环境的state数目
|
||||
action_dim: 输出的action总个数
|
||||
n_states: 输入的feature即环境的state数目
|
||||
n_actions: 输出的action总个数
|
||||
"""
|
||||
super(FCN, self).__init__()
|
||||
self.fc1 = nn.Linear(state_dim, 128) # 输入层
|
||||
self.fc1 = nn.Linear(n_states, 128) # 输入层
|
||||
self.fc2 = nn.Linear(128, 128) # 隐藏层
|
||||
self.fc3 = nn.Linear(128, action_dim) # 输出层
|
||||
self.fc3 = nn.Linear(128, n_actions) # 输出层
|
||||
|
||||
def forward(self, x):
|
||||
# 各层对应的激活函数
|
||||
@@ -66,7 +66,7 @@ class FCN(nn.Module):
|
||||
x = F.relu(self.fc2(x))
|
||||
return self.fc3(x)
|
||||
```
|
||||
输入为state_dim,输出为action_dim,包含一个128维度的隐藏层,这里根据需要可增加隐藏层维度和数量,然后一般使用relu激活函数,这里跟深度学习的网路设置是一样的。
|
||||
输入为n_states,输出为n_actions,包含一个128维度的隐藏层,这里根据需要可增加隐藏层维度和数量,然后一般使用relu激活函数,这里跟深度学习的网络设置是一样的。
|
||||
|
||||
### Replay Buffer
|
||||
|
||||
@@ -107,8 +107,8 @@ class ReplayBuffer:
|
||||
在类中建立两个网络,以及optimizer和memory,
|
||||
|
||||
```python
|
||||
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
|
||||
@@ -124,7 +124,7 @@ def choose_action(self, state):
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
action = self.predict(state)
|
||||
else:
|
||||
action = random.randrange(self.action_dim)
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
```
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:50:49
|
||||
@LastEditor: John
|
||||
LastEditTime: 2021-12-22 14:01:37
|
||||
LastEditTime: 2022-03-02 11:05:11
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
@@ -20,22 +20,7 @@ import random
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, state_dim,action_dim,hidden_dim=128):
|
||||
""" 初始化q网络,为全连接网络
|
||||
state_dim: 输入的特征数即环境的状态维度
|
||||
action_dim: 输出的动作维度
|
||||
"""
|
||||
super(MLP, self).__init__()
|
||||
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
|
||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
|
||||
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
|
||||
|
||||
def forward(self, x):
|
||||
# 各层对应的激活函数
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
return self.fc3(x)
|
||||
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
@@ -62,9 +47,9 @@ class ReplayBuffer:
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, state_dim, action_dim, cfg):
|
||||
def __init__(self, n_actions,model,cfg):
|
||||
|
||||
self.action_dim = action_dim # 总的动作个数
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = cfg.device # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
@@ -73,8 +58,8 @@ class DQN:
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.policy_net = model.to(self.device)
|
||||
self.target_net = model.to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
@@ -86,23 +71,24 @@ class DQN:
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
with torch.no_grad():
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.action_dim)
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
return
|
||||
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
|
||||
# print('updating')
|
||||
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
# 转为张量
|
||||
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
|
||||
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
|
||||
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
|
||||
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
|
||||
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
|
||||
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
|
||||
|
||||
@@ -70,9 +70,9 @@ class ReplayBuffer:
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, state_dim, action_dim, cfg):
|
||||
def __init__(self, n_states, n_actions, cfg):
|
||||
|
||||
self.action_dim = action_dim # 总的动作个数
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = cfg.device # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
@@ -81,8 +81,8 @@ class DQN:
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = CNN(state_dim, action_dim).to(self.device)
|
||||
self.target_net = CNN(state_dim, action_dim).to(self.device)
|
||||
self.policy_net = CNN(n_states, n_actions).to(self.device)
|
||||
self.target_net = CNN(n_states, n_actions).to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
@@ -94,11 +94,12 @@ class DQN:
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
with torch.no_grad():
|
||||
print(type(state))
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.action_dim)
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
|
||||
142
codes/DQN/dqn_cnn2.py
Normal file
142
codes/DQN/dqn_cnn2.py
Normal file
@@ -0,0 +1,142 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.autograd as autograd
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
class CNN(nn.Module):
    """Convolutional Q-network: two conv layers followed by two linear layers.

    The network maps a batch of stacked frames laid out as
    ``(batch, n_frames, height, width)`` to one Q-value per action,
    i.e. an output of shape ``(batch, n_actions)``.
    """

    def __init__(self, n_frames, n_actions, input_shape=(80, 80)):
        """Build the layers.

        Args:
            n_frames: number of input channels (stacked frames).
            n_actions: number of outputs, one Q-value per action.
            input_shape: ``(height, width)`` of a single frame. The default
                ``(80, 80)`` yields 3200 flattened features, the size that
                used to be hard-coded in ``fc1``.
        """
        super().__init__()
        self.n_frames = n_frames
        self.n_actions = n_actions

        # Convolutional feature extractor
        self.conv1 = nn.Conv2d(
            in_channels=n_frames,
            out_channels=16,
            kernel_size=8,
            stride=4,
            padding=2,
        )
        self.conv2 = nn.Conv2d(
            in_channels=16,
            out_channels=32,
            kernel_size=4,
            stride=2,
            padding=1,
        )

        # Infer the flattened feature count from a dummy pass instead of
        # hard-coding 3200, so other frame sizes work as well. ReLU does not
        # change shapes, so it is not needed for the probe.
        with torch.no_grad():
            probe = torch.zeros(1, n_frames, *input_shape)
            n_flat = self.conv2(self.conv1(probe)).flatten(1).size(1)

        # Fully connected head
        self.fc1 = nn.Linear(in_features=n_flat, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=n_actions)

        # Activation function
        self.relu = nn.ReLU()

    def flatten(self, x):
        """Collapse every dimension after the batch dimension into one."""
        batch_size = x.size()[0]
        return x.view(batch_size, -1)

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)`` for input ``x``."""
        # Shapes below are for the default 80x80 input, channels first.
        x = self.relu(self.conv1(x))  # (N, n_frames, 80, 80) -> (N, 16, 20, 20)
        x = self.relu(self.conv2(x))  # (N, 16, 20, 20) -> (N, 32, 10, 10)
        x = self.flatten(x)           # (N, 32, 10, 10) -> (N, 3200)
        x = self.relu(self.fc1(x))    # (N, 3200) -> (N, 256)
        x = self.fc2(x)               # (N, 256) -> (N, n_actions)
        return x
|
||||
|
||||
class ReplayBuffer:
    """Fixed-capacity experience replay stored as a ring buffer."""

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions kept; must be positive.

        Raises:
            ValueError: if ``capacity`` is not positive. Without this check
                the first ``push`` would fail with an unrelated
                IndexError/ZeroDivisionError.
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity  # maximum number of stored transitions
        self.buffer = []  # storage for the transitions
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one once full."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)  # grow until capacity is reached
        self.buffer[self.position] = (state, action, reward, next_state, done)
        # Wrap around so the oldest entry is the next one to be replaced.
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five tuples ``(states, actions, rewards, next_states, dones)``,
            each of length ``batch_size``.

        Raises:
            ValueError: raised by ``random.sample`` when ``batch_size`` is
                larger than the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)  # transpose
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
|
||||
|
||||
class DQN:
    """DQN agent with a CNN policy network, a target network and replay memory."""

    def __init__(self, n_states, n_actions, cfg):
        """Build the networks, optimizer and replay memory.

        Args:
            n_states: passed to ``CNN`` as its first argument (number of
                input channels).
            n_actions: total number of discrete actions.
            cfg: configuration object; the attributes read here are
                ``device``, ``gamma``, ``epsilon_start``, ``epsilon_end``,
                ``epsilon_decay``, ``batch_size``, ``lr`` and
                ``memory_capacity``.
        """
        self.n_actions = n_actions  # total number of actions
        self.device = cfg.device  # device the networks live on
        self.gamma = cfg.gamma  # reward discount factor
        # epsilon-greedy parameters
        self.frame_idx = 0  # counter driving the epsilon decay
        # The lambda reads cfg lazily on every call, so changes made to
        # cfg.epsilon_* after construction take effect immediately.
        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
        self.policy_net = CNN(n_states, n_actions).to(self.device)
        self.target_net = CNN(n_states, n_actions).to(self.device)
        # Start the target network as an exact copy of the policy network.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
        self.memory = ReplayBuffer(cfg.memory_capacity)  # experience replay

    def choose_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy rule.

        Every call increments ``frame_idx``, which drives the epsilon decay.
        """
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            with torch.no_grad():
                # Convert through one ndarray and add the batch dimension
                # afterwards; wrapping the state in a Python list first is
                # slow for array-like observations.
                state = torch.tensor(
                    np.asarray(state), device=self.device, dtype=torch.float32
                ).unsqueeze(dim=0)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item()  # action with the largest Q-value
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        """Run one optimisation step on a random mini-batch from the memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        # Sample a random mini-batch of transitions from the replay memory.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)
        # Convert to tensors; stacking into a single ndarray first avoids the
        # slow tuple-of-arrays conversion path.
        state_batch = torch.tensor(
            np.array(state_batch), device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(
            np.array(next_state_batch), device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
        # Q(s_t, a) for the actions that were actually taken.
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
        # max_a Q_target(s_{t+1}, a), detached from the graph.
        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()
        # For terminal transitions (done == 1) the target is just the reward.
        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)
        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error
        # Optimise the policy network.
        self.optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-1, 1] against exploding
        # gradients; the utility skips parameters without a gradient.
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

    def save(self, path):
        """Write the target network weights to ``path + 'dqn_checkpoint.pth'``."""
        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')

    def load(self, path):
        """Load ``path + 'dqn_checkpoint.pth'`` into both networks."""
        # map_location lets a checkpoint saved on another device (e.g. GPU)
        # be restored on this agent's device.
        state_dict = torch.load(path+'dqn_checkpoint.pth', map_location=self.device)
        self.target_net.load_state_dict(state_dict)
        self.policy_net.load_state_dict(self.target_net.state_dict())
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 28 KiB |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 41 KiB |
Binary file not shown.
@@ -1,5 +1,7 @@
|
||||
import sys
|
||||
import os
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
@@ -8,26 +10,42 @@ import gym
|
||||
import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import save_results_1, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from DQN.dqn import DQN
|
||||
from dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
|
||||
class MLP(nn.Module):
    """Fully connected Q-network with two ReLU layers and a linear output."""

    def __init__(self, n_states, n_actions, hidden_dim=128):
        """Create the three linear layers.

        Args:
            n_states: number of input features (dimension of the state).
            n_actions: number of outputs (dimension of the action space).
            hidden_dim: width of the two hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer

    def forward(self, x):
        """Apply ReLU after the first two layers; the output stays linear."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        q_values = self.fc3(hidden)
        return q_values
|
||||
|
||||
class Config:
|
||||
'''超参数
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
################################## 环境超参数 ###################################
|
||||
self.algo_name = 'DQN' # 算法名称
|
||||
self.env_name = 'CartPole-v0' # 环境名称
|
||||
############################### hyperparameters ################################
|
||||
self.algo_name = 'DQN' # algorithm name
|
||||
self.env_name = 'CartPole-v0' # environment name
|
||||
self.device = torch.device(
|
||||
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
|
||||
"cuda" if torch.cuda.is_available() else "cpu") # check GPU
|
||||
self.seed = 10 # 随机种子,置0则不设置随机种子
|
||||
self.train_eps = 200 # 训练的回合数
|
||||
self.test_eps = 30 # 测试的回合数
|
||||
self.test_eps = 20 # 测试的回合数
|
||||
################################################################################
|
||||
|
||||
################################## 算法超参数 ###################################
|
||||
@@ -41,8 +59,8 @@ class Config:
|
||||
self.target_update = 4 # 目标网络的更新频率
|
||||
self.hidden_dim = 256 # 网络隐藏层
|
||||
################################################################################
|
||||
|
||||
################################# 保存结果相关参数 ##############################
|
||||
|
||||
################################# 保存结果相关参数 ################################
|
||||
self.result_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/results/' # 保存结果的路径
|
||||
self.model_path = curr_path + "/outputs/" + self.env_name + \
|
||||
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
|
||||
''' 创建环境和智能体
|
||||
'''
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
state_dim = env.observation_space.shape[0] # 状态维度
|
||||
action_dim = env.action_space.n # 动作维度
|
||||
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
|
||||
n_states = env.observation_space.shape[0] # 状态维度
|
||||
n_actions = env.action_space.n # 动作维度
|
||||
print(f"n states: {n_states}, n actions: {n_actions}")
|
||||
model = MLP(n_states,n_actions)
|
||||
agent = DQN(n_actions, model, cfg) # 创建智能体
|
||||
if cfg.seed !=0: # 设置随机种子
|
||||
torch.manual_seed(cfg.seed)
|
||||
env.seed(cfg.seed)
|
||||
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
ep_step = 0
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step += 1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward,
|
||||
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
|
||||
break
|
||||
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
|
||||
print('完成训练!')
|
||||
if (i_ep + 1) % 1 == 0:
|
||||
print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
|
||||
print('Finish training!')
|
||||
env.close()
|
||||
return rewards, ma_rewards
|
||||
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
|
||||
def test(cfg, env, agent):
|
||||
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
|
||||
################################################################################
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
ep_step = 0
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step+=1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
|
||||
print('完成测试!')
|
||||
env.close()
|
||||
return rewards, ma_rewards
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = Config()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
save_results_1(res_dic, tag='train',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test',
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results_1(res_dic, tag='test',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-12-22 11:14:17
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-12-22 11:40:44
|
||||
LastEditTime: 2022-02-10 06:17:41
|
||||
Discription: 使用 Nature DQN 训练 CartPole-v1
|
||||
'''
|
||||
import sys
|
||||
@@ -19,7 +19,7 @@ import torch
|
||||
import datetime
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards, plot_rewards_cn
|
||||
from DQN.dqn import DQN
|
||||
from dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = "DQN" # 算法名称
|
||||
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
|
||||
'''
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
env.seed(seed) # 设置随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态维度
|
||||
action_dim = env.action_space.n # 动作维度
|
||||
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
|
||||
n_states = env.observation_space.shape[0] # 状态维度
|
||||
n_actions = env.action_space.n # 动作维度
|
||||
agent = DQN(n_states, n_actions, cfg) # 创建智能体
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-12-22 11:14:17
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2021-12-22 15:27:48
|
||||
LastEditTime: 2022-02-10 06:17:46
|
||||
Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
|
||||
'''
|
||||
import sys
|
||||
@@ -20,7 +20,7 @@ import datetime
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards, plot_rewards_cn
|
||||
from common.atari_wrappers import make_atari, wrap_deepmind
|
||||
from DQN.dqn import DQN
|
||||
from dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
algo_name = 'DQN-cnn' # 算法名称
|
||||
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
|
||||
# env = wrap_deepmind(env)
|
||||
# env = wrap_pytorch(env)
|
||||
env.seed(seed) # 设置随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态维度
|
||||
action_dim = env.action_space.n # 动作维度
|
||||
agent = DQN(state_dim, action_dim, cfg) # 创建智能体
|
||||
n_states = env.observation_space.shape[0] # 状态维度
|
||||
n_actions = env.action_space.n # 动作维度
|
||||
agent = DQN(n_states, n_actions, cfg) # 创建智能体
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
|
||||
180
codes/DQN/task4.py
Normal file
180
codes/DQN/task4.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import sys
|
||||
import os
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
from common.utils import save_results_1, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from dqn_1 import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
|
||||
class MLP(nn.Module):
    """Fully connected Q-network with three ReLU layers and a linear output."""

    def __init__(self, n_states, n_actions, hidden_dim=256):
        """Create the four linear layers.

        Args:
            n_states: number of input features (dimension of the state).
            n_actions: number of outputs (dimension of the action space).
            hidden_dim: width shared by all hidden layers.
        """
        super().__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc4 = nn.Linear(hidden_dim, n_actions)  # output layer

    def forward(self, x):
        """Apply ReLU after the first three layers; the output stays linear."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)
|
||||
|
||||
class Config:
    """Hyperparameters and output locations for this experiment."""

    def __init__(self):
        # ----------------------- environment settings -----------------------
        self.algo_name = 'DQN'  # algorithm name
        # Alternative environment id tried earlier: 'Breakout-ram-v0'
        self.env_name = 'ALE/Pong-ram-v5'  # environment name
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")  # use the GPU when present
        self.seed = 10  # random seed; 0 means "do not seed"
        self.train_eps = 5  # number of training episodes
        self.test_eps = 30  # number of test episodes

        # ------------------------ algorithm settings ------------------------
        self.gamma = 0.99  # discount factor
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
        self.epsilon_decay = 500000  # decay rate of epsilon
        self.lr = 0.00025  # learning rate
        self.memory_capacity = int(5e4)  # capacity of the experience replay
        self.batch_size = 32  # mini-batch size
        self.target_update = 4  # update frequency of the target network
        self.hidden_dim = 512  # hidden layer width of the network

        # -------------------------- output settings -------------------------
        run_dir = f"{curr_path}/outputs/{self.env_name}/{curr_time}"
        self.result_path = run_dir + '/results/'  # where results are saved
        self.model_path = run_dir + '/models/'  # where models are saved
        self.save = True  # whether to save the figures
|
||||
|
||||
|
||||
def env_agent_config(cfg):
    """Create the environment and the agent described by ``cfg``.

    Args:
        cfg: configuration object; ``env_name``, ``seed`` and ``hidden_dim``
            are read here and ``cfg`` itself is forwarded to ``DQN``.

    Returns:
        Tuple ``(env, agent)``.
    """
    # Local import: `random` is not among this file's top-level imports.
    import random

    env = gym.make(cfg.env_name)  # create the environment
    if cfg.seed != 0:  # a seed of 0 means "leave the generators unseeded"
        # Seed BEFORE the network is constructed; seeding afterwards leaves
        # the weight initialisation non-reproducible.
        torch.manual_seed(cfg.seed)
        np.random.seed(cfg.seed)
        # NOTE(review): the agent presumably draws from Python's `random`
        # for exploration/replay sampling - confirm against dqn_1.DQN.
        random.seed(cfg.seed)
        env.seed(cfg.seed)
    n_states = env.observation_space.shape[0]  # state dimension
    n_actions = env.action_space.n  # number of actions
    print(f"n states: {n_states}, n actions: {n_actions}")
    # Honour the configured width instead of silently using MLP's default.
    model = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim)
    agent = DQN(n_states, n_actions, model, cfg)  # create the agent
    return env, agent
|
||||
|
||||
|
||||
def train(cfg, env, agent):
    """Train ``agent`` on ``env`` for ``cfg.train_eps`` episodes.

    Returns:
        Dict with the per-episode lists ``'rewards'``, ``'ma_rewards'``
        (exponential moving average of the rewards) and ``'steps'``.
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards, ma_rewards, steps = [], [], []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward collected in this episode
        state = env.reset()  # initial state of the episode
        ep_step = 0
        done = False
        while not done:
            ep_step += 1
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.memory.push(state, action, reward, next_state, done)  # store the transition
            state = next_state
            agent.update()  # one learning step per environment step
            ep_reward += reward
        # Sync the target network every `target_update` episodes.
        if (i_ep + 1) % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        steps.append(ep_step)
        rewards.append(ep_reward)
        # The first episode seeds the moving average.
        smoothed = 0.9 * ma_rewards[-1] + 0.1 * ep_reward if ma_rewards else ep_reward
        ma_rewards.append(smoothed)
        print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
    print('完成训练!')
    env.close()
    return {'rewards': rewards, 'ma_rewards': ma_rewards, 'steps': steps}
|
||||
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
################################################################################
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
ep_step = 0
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step+=1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
env.close()
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    cfg = Config()

    # ---- training phase ----
    env, agent = env_agent_config(cfg)
    res_dic = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # create the result and model folders
    agent.save(path=cfg.model_path)  # save the model
    save_results_1(res_dic, tag='train', path=cfg.result_path)  # save the results
    plot_rewards(
        res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # plot the results

    # ---- testing phase (fresh environment and agent, weights loaded back) ----
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # load the saved model
    res_dic = test(cfg, env, agent)
    save_results_1(res_dic, tag='test', path=cfg.result_path)  # save the results
    plot_rewards(
        res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")  # plot the results
|
||||
149
codes/DQN/task5.py
Normal file
149
codes/DQN/task5.py
Normal file
@@ -0,0 +1,149 @@
|
||||
import sys
|
||||
import os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
|
||||
|
||||
class Config:
    """Hyperparameters and output locations for this experiment."""

    def __init__(self):
        # ----------------------- environment settings -----------------------
        self.algo_name = 'DQN'  # algorithm name
        self.env_name = 'SpaceInvaders-ram-v0'  # environment name
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")  # use the GPU when present
        self.seed = 10  # random seed; 0 means "do not seed"
        self.train_eps = 200  # number of training episodes
        self.test_eps = 30  # number of test episodes

        # ------------------------ algorithm settings ------------------------
        self.gamma = 0.99  # discount factor
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
        self.epsilon_decay = 20000  # decay rate of epsilon
        self.lr = 2e-4  # learning rate
        self.memory_capacity = int(1e5)  # capacity of the experience replay
        self.batch_size = 32  # mini-batch size
        self.target_update = 4  # update frequency of the target network
        self.hidden_dim = 512  # hidden layer width of the network

        # -------------------------- output settings -------------------------
        run_dir = f"{curr_path}/outputs/{self.env_name}/{curr_time}"
        self.result_path = run_dir + '/results/'  # where results are saved
        self.model_path = run_dir + '/models/'  # where models are saved
        self.save = True  # whether to save the figures
|
||||
|
||||
|
||||
def env_agent_config(cfg):
    """Create the environment and the agent, seeding the RNGs first.

    Seeding happens *before* the agent is constructed so that the seed can
    influence the networks' random weight initialisation; seeding afterwards
    (as the previous version did) cannot.

    Args:
        cfg: configuration object. ``cfg.env_name`` selects the gym
            environment, ``cfg.seed`` is the random seed (0 disables
            seeding), and the whole object is passed on to ``DQN``.

    Returns:
        A tuple ``(env, agent)``.
    """
    # Local import: `random` is not among this script's visible imports.
    import random

    env = gym.make(cfg.env_name)  # create the environment
    if cfg.seed != 0:  # 0 means "leave everything unseeded"
        torch.manual_seed(cfg.seed)
        np.random.seed(cfg.seed)
        # The agent's epsilon-greedy action selection uses Python's `random`.
        random.seed(cfg.seed)
        env.seed(cfg.seed)
    n_states = env.observation_space.shape[0]  # observation dimension
    n_actions = env.action_space.n  # number of discrete actions
    print(f"n states: {n_states}, n actions: {n_actions}")
    agent = DQN(n_states, n_actions, cfg)  # create the agent
    return env, agent
|
||||
|
||||
|
||||
def train(cfg, env, agent):
    """Run the training loop and collect per-episode rewards.

    Args:
        cfg: configuration object; reads ``train_eps``, ``target_update``,
            ``env_name``, ``algo_name`` and ``device``.
        env: environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(next_state, reward, done, info)``, and ``close()``.
        agent: agent exposing ``choose_action``, ``memory.push``, ``update``,
            ``policy_net``/``target_net``, ``epsilon`` and ``frame_idx``.

    Returns:
        ``(rewards, ma_rewards)``: the raw episode rewards and their
        exponential moving average (weight 0.9 on the previous average).

    Note:
        The environment is closed before returning.
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # reward of every episode
    ma_rewards = []  # moving average of the episode rewards
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated within this episode
        state = env.reset()  # reset the environment, get the initial state
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.memory.push(state, action, reward, next_state, done)  # store the transition
            state = next_state
            agent.update()  # one learning step per environment step
            ep_reward += reward
        # The target network is synchronised every `target_update` episodes.
        if (i_ep + 1) % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
        # Progress is logged after every episode.
        print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
    print('完成训练!')
    env.close()
    return rewards, ma_rewards
|
||||
|
||||
|
||||
def test(cfg, env, agent):
    """Evaluate the agent for ``cfg.test_eps`` episodes without learning.

    Both ``cfg.epsilon_start`` and ``cfg.epsilon_end`` are overwritten with
    0.0 (and left that way) because evaluation does not need the
    epsilon-greedy exploration. The environment is closed before returning.

    Returns:
        ``(rewards, ma_rewards)``: the raw episode rewards and their
        exponential moving average.
    """
    print('开始测试!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    # Evaluation does not use epsilon-greedy, so zero both ends of the schedule.
    cfg.epsilon_start = cfg.epsilon_end = 0.0
    rewards, ma_rewards = [], []  # per-episode rewards and their moving average
    for i_ep in range(cfg.test_eps):
        state = env.reset()  # initial state of the episode
        ep_reward = 0  # reward accumulated within this episode
        finished = False
        while not finished:
            chosen = agent.choose_action(state)
            state, step_reward, finished, _ = env.step(chosen)
            ep_reward += step_reward
        rewards.append(ep_reward)
        if not ma_rewards:
            smoothed = ep_reward
        else:
            smoothed = ma_rewards[-1] * 0.9 + ep_reward * 0.1
        ma_rewards.append(smoothed)
        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
    print('完成测试!')
    env.close()
    return rewards, ma_rewards
|
||||
|
||||
|
||||
if __name__ == "__main__":
    cfg = Config()

    # ------------------------------- training -------------------------------
    env, agent = env_agent_config(cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    # Create the result and model folders before anything is written to them.
    make_dir(cfg.result_path, cfg.model_path)
    agent.save(path=cfg.model_path)  # save the trained model
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)  # save the results
    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the results

    # ------------------------------ evaluation ------------------------------
    # A fresh environment/agent pair is built and the saved model loaded into it.
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # load the model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test', path=cfg.result_path)  # save the results
    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the results
|
||||
184
codes/DQN/test copy.py
Normal file
184
codes/DQN/test copy.py
Normal file
@@ -0,0 +1,184 @@
|
||||
import random
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
import os
|
||||
import gym
|
||||
import time
|
||||
from collections import deque
|
||||
from tensorflow.keras import optimizers
|
||||
from keras.models import Sequential
|
||||
from keras.layers import Dense, Dropout
|
||||
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
class DQN:
    """Keras DQN agent with an online ("evaluation") and a target network.

    Transitions are kept in a bounded deque and sampled uniformly for
    training; actions are chosen epsilon-greedily with a linearly annealed
    epsilon.
    """

    def __init__(self, env):
        """Build the agent for ``env``.

        Args:
            env: gym-style environment; ``observation_space.shape[0]`` and
                ``action_space.n`` size the network, and
                ``action_space.sample()`` supplies random actions.
        """
        self.env = env
        self.memory = deque(maxlen=400000)
        self.memory_counter = 0  # total number of transitions ever stored
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        # Per-step decrement that anneals epsilon linearly from its initial
        # value down to epsilon_min over 500000 decayed steps. (Dividing
        # epsilon_min itself by 500000 gave a 2e-8 step, i.e. epsilon would
        # effectively never decay.)
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 500000

        self.batch_size = 32
        self.train_start = 1000  # minimum memory size before replay() trains
        # Each observation entry is expanded into four 2-bit values, see transform().
        self.state_size = self.env.observation_space.shape[0] * 4
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.00025

        self.evaluation_model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Return a compiled MLP mapping a state to one Q-value per action."""
        model = Sequential()
        model.add(Dense(128 * 2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        # NOTE(review): in Keras, `decay` is a learning-rate decay, not the
        # RMSprop moving-average coefficient (`rho`); confirm 0.99 is meant
        # as LR decay. `lr` is also a deprecated alias of `learning_rate`.
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.RMSprop(lr=self.learning_rate, decay=0.99, epsilon=1e-6))
        return model

    def choose_action(self, state, steps):
        """Pick an action epsilon-greedily.

        Args:
            state: network input for the current state (a single-row batch).
            steps: global step counter; epsilon only starts decaying once it
                exceeds 50000.

        Returns:
            A random action with probability epsilon, otherwise the argmax
            of the evaluation model's Q-values.
        """
        if steps > 50000 and self.epsilon > self.epsilon_min:
            # Clamp so that the last decrement cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.evaluation_model.predict(state)[0])

    def remember(self, cur_state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((cur_state, action, reward, new_state, done))
        # getattr keeps this working for instances created without __init__.
        self.memory_counter = getattr(self, 'memory_counter', 0) + 1

    def replay(self):
        """Train the evaluation model on one uniformly sampled mini-batch.

        Does nothing until the memory holds at least ``train_start``
        transitions. The TD target for the taken action is ``reward`` for
        terminal transitions and ``reward + gamma * max_a Q_target(s', a)``
        otherwise; the other actions keep the evaluation model's own
        predictions.
        """
        if len(self.memory) < self.train_start:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        # Stack the batch so each network is queried once instead of once
        # per sample.
        states = np.vstack([t[0] for t in mini_batch]).astype(np.float64)
        next_states = np.vstack([t[3] for t in mini_batch])
        actions = np.array([t[1] for t in mini_batch], dtype=np.int64)
        rewards = np.array([t[2] for t in mini_batch], dtype=np.float64)
        dones = np.array([t[4] for t in mini_batch], dtype=bool)

        targets = np.array(self.evaluation_model.predict(states), dtype=np.float64)
        next_q = np.max(self.target_model.predict(next_states), axis=1).astype(np.float64)
        td_targets = np.where(dones, rewards, rewards + self.gamma * next_q)
        targets[np.arange(self.batch_size), actions] = td_targets

        self.evaluation_model.fit(states, targets, batch_size=self.batch_size, epochs=1, verbose=0)

    def target_train(self):
        """Copy the evaluation model's weights into the target model."""
        self.target_model.set_weights(self.evaluation_model.get_weights())

    def visualize(self, reward, episode):
        """Plot ``reward`` against ``episode`` and show the figure."""
        plt.plot(episode, reward, 'ob-')
        plt.title('Average reward each 100 episode')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()

    def transform(self, state):
        """Expand each byte of a single-row state into four 2-bit values.

        Args:
            state: 2-D array whose first row holds byte-valued integers. A
                state that already has 512 columns is returned unchanged.

        Returns:
            A 1-D integer array four times as long as the row, holding the
            2-bit groups of every byte from most to least significant, each
            in the range 0..3.
        """
        if state.shape[1] == 512:
            return state
        ram = np.asarray(state[0]).astype(np.uint8).astype(np.int64)
        shifts = np.array([6, 4, 2, 0])
        # Broadcasting yields one row of four 2-bit groups per byte.
        return ((ram[:, None] >> shifts) & 3).reshape(-1)
|
||||
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
|
||||
def main():
    """Train the Keras DQN agent on Breakout RAM observations.

    Runs a fixed number of episodes, prints per-episode statistics, records
    the mean episode reward of every 100-episode window, and finally plots
    those means.
    """
    env = gym.make('Breakout-ram-v0')
    env = env.unwrapped  # work with the base environment, without gym wrappers

    print(env.action_space)
    print(env.observation_space.shape[0])
    print(env.observation_space.high)
    print(env.observation_space.low)

    episodes = 5000
    trial_len = 10000  # maximum number of steps per episode
    report_every = 100  # window (in episodes) for the averaged reward

    sum_rewards = 0  # raw reward accumulated over the current window
    total_steps = 0  # environment steps taken over the whole run

    graph_reward = []
    graph_episodes = []
    time_record = []

    dqn_agent = DQN(env=env)

    def preprocess(observation):
        # 128 RAM bytes -> 512 two-bit values, scaled by 1/4.
        return dqn_agent.transform(observation.reshape(1, 128)).reshape(1, 128 * 4) / 4

    for i_episode in range(episodes):
        start_time = time.time()
        ep_reward = 0  # raw (unclipped) reward of this episode
        i_step = 0  # number of steps taken in this episode
        cur_state = preprocess(env.reset())
        for i_step in range(1, trial_len + 1):
            action = dqn_agent.choose_action(cur_state, total_steps)
            new_state, reward, done, _ = env.step(action)
            new_state = preprocess(new_state)
            sum_rewards += reward
            ep_reward += reward
            # Statistics use the raw reward; the stored transition gets
            # positive rewards clipped to 1 (marked as experimental in the
            # original code).
            if reward > 0:
                reward = 1

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            # Learning starts after 10000 warm-up steps: train every 4th
            # step and sync the target network every 5000th step.
            if total_steps > 10000:
                if total_steps % 4 == 0:
                    dqn_agent.replay()
                if total_steps % 5000 == 0:
                    dqn_agent.target_train()

            cur_state = new_state
            total_steps += 1
            if done:
                # No reset here: the next episode starts with env.reset().
                break
        if (i_episode + 1) % report_every == 0:
            graph_reward.append(sum_rewards / report_every)
            graph_episodes.append(i_episode + 1)
            sum_rewards = 0
            print("Episode ", i_episode + 1, " Reward: ")
            print(graph_reward[-1])
        end_time = time.time()
        time_record.append(end_time - start_time)
        print("NOW in episode: " + str(i_episode))
        print("Time cost: " + str(end_time - start_time))
        print("Reward: ", ep_reward)
        print("Step:", i_step)
    print("Reward: ")
    print(graph_reward)
    print("Episode: ")
    print(graph_episodes)
    print("Average_time: ")
    # Average over the episodes actually recorded, not a hard-coded count.
    print(sum(time_record) / max(len(time_record), 1))
    dqn_agent.visualize(graph_reward, graph_episodes)
|
||||
|
||||
if __name__ == "__main__":
    # Start training only when the file is executed as a script.
    main()
|
||||
Reference in New Issue
Block a user