Commit 106cfcc714 (parent f0d19ac14f)
Author: JohnJim0816
Date: 2020-09-08 13:36:26 +08:00
10 changed files with 108 additions and 83 deletions

Changed file: the DQN agent class

@@ -5,12 +5,16 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
@LastEditTime: 2020-08-22 15:44:31
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import torch
import torch.nn as nn
import torch.optim as optim
@@ -20,79 +24,97 @@ import math
import numpy as np
from memory import ReplayBuffer
from model import FCN

class DQN:
    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"):
        self.actions_count = 0
        self.n_actions = n_actions  # total number of actions
        self.device = device  # device, e.g. cpu or gpu
        self.gamma = gamma
        # parameters of the epsilon-greedy policy
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # target_net starts as an exact copy of policy_net's parameters
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # disable BatchNormalization and Dropout
        # see the difference between parameters() and state_dict(): the former has requires_grad=True
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)
    def select_action(self, state):
        '''Select an action
        Args:
            state [array]: current state of the environment
        Returns:
            action [int]: the chosen action
        '''
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # convert to a tensor first so it can be fed to the network; the state elements are originally float64
                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
                state = torch.tensor(
                    [state], device=self.device, dtype=torch.float32)
                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
                q_value = self.policy_net(state)
                # tensor.max(1) returns the max value of each row together with its index,
                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
                # so tensor.max(1)[1] is the index of the max value, i.e. the action
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action
    def update(self):
        if len(self.memory) < self.batch_size:
            return
        # sample a random batch of transitions from memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # convert to tensors,
        # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])
        state_batch = torch.tensor(
            state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
            1)  # e.g. tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)  # tensor([1., 1.,...,1])
        next_state_batch = torch.tensor(
            next_state_batch, device=self.device, dtype=torch.float)
        # convert the done flags from bool to float, then to a tensor
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
        # compute Q(s_t, a) for the current batch.
        # about torch.gather: for a = torch.Tensor([[1,2],[3,4]]),
        # a.gather(1, torch.LongTensor([[0],[1]])) = torch.Tensor([[1.],[3.]])
        q_values = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)  # equivalent to calling self.policy_net.forward
        # compute V(s_{t+1}) for all next states, i.e. the max Q-value from target_net
        next_state_values = self.target_net(
            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
        # compute expected_q_values;
        # for terminal states done=1, so the expected Q-value reduces to the reward
        expected_q_values = reward_batch + self.gamma * \
            next_state_values * (1 - done_batch)
        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber loss
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
        # optimize the model
        self.optimizer.zero_grad()  # zero_grad clears the old gradients from the last step
        # loss.backward() computes the gradient of the loss w.r.t. all parameters that require gradients via backpropagation
        self.loss.backward()
        for param in self.policy_net.parameters():  # clip to prevent exploding gradients
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # update the model parameters
    def save_model(self):
        pass

    def load_model(self):
        pass
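A standalone sketch of the two tensor operations the comments in update() rely on, torch.gather for picking Q(s_t, a_t) and tensor.max(1) for the greedy target; the batch of Q-values below is made up purely for illustration:

import torch

q_out = torch.tensor([[0.10, 0.30],
                      [0.70, 0.20],
                      [0.05, 0.60]])            # fake Q-values: batch of 3 states, 2 actions
actions = torch.tensor([[1], [0], [1]])         # actions taken, shape [batch, 1]
q_sa = q_out.gather(dim=1, index=actions)       # Q(s_t, a_t) -> tensor([[0.30], [0.70], [0.60]])
v_next = q_out.max(1)[0]                        # max_a Q(s, a) per row -> tensor([0.30, 0.70, 0.60])
greedy_a = q_out.max(1)[1]                      # argmax per row -> tensor([1, 0, 1])
print(q_sa.squeeze(1), v_next, greedy_a)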

Changed file: the training script

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
@LastEditTime: 2020-08-22 18:02:56
@Discription:
@Environment: python 3.7.7
'''
@@ -14,26 +14,27 @@ import torch
from dqn import DQN
from plot import plot
import argparse

def get_args():
    '''Model hyperparameters
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--gamma", default=0.99,
                        type=float)  # gamma in Q-learning
    parser.add_argument("--epsilon_start", default=0.95,
                        type=float)  # initial epsilon for epsilon-greedy action selection
    parser.add_argument("--epsilon_end", default=0.01, type=float)
    parser.add_argument("--epsilon_decay", default=500, type=float)
    parser.add_argument("--policy_lr", default=0.01, type=float)
    parser.add_argument("--memory_capacity", default=1000,
                        type=int, help="capacity of Replay Memory")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="batch size of memory sampling")
    parser.add_argument("--max_episodes", default=200, type=int)  # maximum number of training episodes
    parser.add_argument("--max_steps", default=200, type=int)
    # with target_update=1 the target net is synced after every episode; larger values keep a fixed target for several episodes
    parser.add_argument("--target_update", default=1, type=int,
                        help="how often (in episodes) to update the target net")
    config = parser.parse_args()
@@ -44,38 +45,34 @@ if __name__ == "__main__":
    cfg = get_args()
    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect gpu
    env = gym.make('CartPole-v0').unwrapped  # you can look up why gym envs are unwrapped; it is generally not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    for i_episode in range(1, cfg.max_episodes+1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.max_steps+1):
            action = agent.select_action(state)  # select an action from the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition in memory
            state = next_state  # move to the next state
            agent.update()  # update the network at every step
            if done:
                break
        # update the target network, copying all weights and biases from the policy net
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:', i_episode, ' Reward: %i' %
              int(ep_reward), 'n_steps:', i_step, 'done: ', done, ' Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving average of rewards
        if i_episode == 1:
@@ -83,14 +80,17 @@ if __name__ == "__main__":
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
    # save rewards and related results
    import os
    import numpy as np
    output_path = os.path.dirname(__file__)+"/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path+"rewards.npy", rewards)
    np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
    np.save(output_path+"steps.npy", ep_steps)
    print('Complete')
    plot(rewards)
    plot(moving_average_rewards, ylabel="moving_average_rewards")
    plot(ep_steps, ylabel="steps_of_each_episode")
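As a rough sanity check on the exploration schedule configured above (a sketch using the argparse defaults, not recorded training output), the exponential epsilon decay can be tabulated directly:

import math

epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500  # the defaults from get_args()
for step in (0, 100, 500, 1000, 2000):
    eps = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-step / epsilon_decay)
    print(step, round(eps, 3))  # roughly 0.95, 0.78, 0.36, 0.14, 0.03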

Changed file: the network model (FCN)

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:47:02
@LastEditor: John
@LastEditTime: 2020-08-19 16:55:54
@Discription:
@Environment: python 3.7.7
'''
@@ -14,17 +14,17 @@ import torch.nn.functional as F
class FCN(nn.Module):
    def __init__(self, n_states=4, n_actions=18):
        """ Initialize the Q network as a fully connected network
            n_states: number of input features, i.e. the dimension of the environment state
            n_actions: total number of output actions
        """
        super(FCN, self).__init__()
        self.fc1 = nn.Linear(n_states, 128)  # input layer
        self.fc2 = nn.Linear(128, 128)  # hidden layer
        self.fc3 = nn.Linear(128, n_actions)  # output layer

    def forward(self, x):
        # ReLU activations on the hidden layers; the output layer is linear
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
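A minimal usage sketch (assuming the CartPole dimensions used by the training script); the network maps a batch of states to one Q-value per action:

import torch
from model import FCN   # mirrors the import used in the agent file

net = FCN(n_states=4, n_actions=2)   # CartPole-v0: 4-dimensional state, 2 actions
dummy_states = torch.randn(32, 4)    # a fake batch of 32 states
q_values = net(dummy_states)         # shape [32, 2], one Q-value per action
print(q_values.shape)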

Changed file: the plotting helper

@@ -5,19 +5,22 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-08-20 16:34:34
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os

def plot(item, ylabel='rewards'):
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DQN')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
    plt.show()
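For reference, a minimal call sequence for the helper above (a sketch with synthetic data; the training script passes the recorded reward and step lists, and the result/ folder next to the script must already exist):

import numpy as np
from plot import plot   # mirrors the import used in the training script

fake_rewards = np.random.randint(10, 200, size=50).tolist()   # synthetic episode rewards, for illustration only
plot(fake_rewards)                                            # saves result/rewards.png and shows the figure
plot(fake_rewards, ylabel="moving_average_rewards")           # saves result/moving_average_rewards.png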

The remaining changes are binary result files: two existing plots were regenerated (31 KiB → 36 KiB and 49 KiB → 46 KiB), one new 51 KiB plot was added, another binary result file was updated, and codes/dqn/result/steps.npy was added as a new file.