add double_dqn
This commit is contained in:
121
codes/double_dqn/dqn.py
Normal file
121
codes/double_dqn/dqn.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
@Author: John
|
||||||
|
@Email: johnjim0816@gmail.com
|
||||||
|
@Date: 2020-06-12 00:50:49
|
||||||
|
@LastEditor: John
|
||||||
|
LastEditTime: 2020-09-01 22:54:02
|
||||||
|
@Description:
|
||||||
|
@Environment: python 3.7.7
|
||||||
|
'''
|
||||||
|
'''off-policy
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import random
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
from memory import ReplayBuffer
|
||||||
|
from model import FCN
|
||||||
|
class DQN:
    """Q-learning agent with an epsilon-greedy policy, a replay buffer and a target network.

    ``policy_net`` is the network being optimised; ``target_net`` is a copy of
    it that is only used to compute bootstrap targets in ``update``.

    NOTE(review): this file lives under ``double_dqn`` but the target computed
    in ``update`` takes the max over ``target_net`` outputs directly (the plain
    DQN target). Confirm whether the Double-DQN target was intended.
    """

    def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"):
        """Build both networks, the optimizer and the replay buffer.

        Args:
            n_states: size of the state vector fed to the networks.
            n_actions: number of discrete actions.
            gamma: discount factor used in the bootstrap target.
            epsilon_start: exploration rate at the first action.
            epsilon_end: exploration rate the schedule decays towards.
            epsilon_decay: time constant (in actions) of the exponential decay.
            memory_capacity: capacity handed to ``ReplayBuffer``.
            policy_lr: learning rate of the Adam optimizer.
            batch_size: number of transitions sampled per ``update`` call.
            device: device the networks and batches are placed on.
        """
        self.actions_count = 0  # number of select_action calls so far; drives the epsilon decay
        self.n_actions = n_actions
        self.device = device
        self.gamma = gamma
        # Parameters of the epsilon-greedy schedule.
        self.epsilon = 0
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.policy_net = FCN(n_states, n_actions).to(self.device)
        self.target_net = FCN(n_states, n_actions).to(self.device)
        # The target network starts as an exact copy of the policy network.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # the target network is never trained directly
        # Only the policy network's parameters are optimised.
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)

    def select_action(self, state):
        """Pick an action with the epsilon-greedy rule and advance the schedule.

        Args:
            state: a single observation (array-like of numbers).

        Returns:
            int: index of the chosen action.
        """
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.actions_count / self.epsilon_decay)
        self.actions_count += 1
        if random.random() > self.epsilon:
            with torch.no_grad():
                # Convert through one contiguous float32 array and add the
                # batch dimension, instead of wrapping an ndarray in a list.
                state = torch.as_tensor(
                    np.asarray(state, dtype=np.float32), device=self.device).unsqueeze(0)
                q_value = self.policy_net(state)
                # max(1) returns (values, indices); the index of the largest
                # Q-value is the greedy action.
                action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.n_actions)
        return action

    def update(self):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        The resulting loss tensor is stored in ``self.loss``.
        """
        if len(self.memory) < self.batch_size:
            return
        # Sample a random batch of transitions.
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
        # Stack into single arrays first; building a tensor from a tuple of
        # ndarrays element by element is very slow.
        state_batch = torch.tensor(
            np.asarray(state_batch), device=self.device, dtype=torch.float)
        action_batch = torch.tensor(
            action_batch, device=self.device, dtype=torch.long).unsqueeze(1)  # column of action indices for gather
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(
            np.asarray(next_state_batch), device=self.device, dtype=torch.float)
        # One 0/1 flag per transition, same 1-D layout as reward_batch.
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)

        # Q(s_t, a_t): pick, for every row, the Q-value of the action taken.
        q_values = self.policy_net(state_batch).gather(
            dim=1, index=action_batch)
        # Value of the next state: largest Q-value predicted by the target net.
        with torch.no_grad():
            next_state_values = self.target_net(next_state_batch).max(1)[0]
        # Bootstrap target. Each transition is masked by its OWN done flag so
        # that terminal transitions reduce to the reward alone; the previous
        # code indexed done_batch[0] and applied the first transition's flag
        # to the whole batch.
        expected_q_values = reward_batch + self.gamma * \
            next_state_values * (1 - done_batch)
        # Mean-squared error; F.smooth_l1_loss (Huber) is a drop-in alternative.
        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
        # Optimise the policy network.
        self.optimizer.zero_grad()
        self.loss.backward()
        # Clamp every gradient element to [-1, 1] to guard against exploding gradients.
        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1)
        self.optimizer.step()

    def save_model(self, path):
        """Write the target network's state dict to ``path``.

        NOTE(review): the target network is saved, not the policy network, so
        the checkpoint reflects the last target sync — confirm this is intended.
        """
        torch.save(self.target_net.state_dict(), path)

    def load_model(self, path):
        """Load a state dict from ``path`` into both networks.

        ``map_location`` lets a checkpoint written on another device be loaded
        on ``self.device``; the target network is synced so it does not keep
        its random initial weights.
        """
        self.policy_net.load_state_dict(
            torch.load(path, map_location=self.device))
        self.target_net.load_state_dict(self.policy_net.state_dict())
|
||||||
143
codes/double_dqn/main.py
Normal file
143
codes/double_dqn/main.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
@Author: John
|
||||||
|
@Email: johnjim0816@gmail.com
|
||||||
|
@Date: 2020-06-12 00:48:57
|
||||||
|
@LastEditor: John
|
||||||
|
LastEditTime: 2020-09-01 22:54:23
|
||||||
|
@Description:
|
||||||
|
@Environment: python 3.7.7
|
||||||
|
'''
|
||||||
|
import gym
|
||||||
|
import torch
|
||||||
|
from dqn import DQN
|
||||||
|
from plot import plot
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def get_args():
    """Parse the hyper-parameters of the experiment from the command line.

    Returns:
        argparse.Namespace: the parsed options, with the defaults below for
        every option that was not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gamma", default=0.99,
                        type=float)  # discount factor of Q-learning
    parser.add_argument("--epsilon_start", default=0.95,
                        type=float)  # initial exploration rate of the epsilon-greedy policy
    parser.add_argument("--epsilon_end", default=0.05, type=float)
    parser.add_argument("--epsilon_decay", default=500, type=float)
    parser.add_argument("--policy_lr", default=0.01, type=float)
    parser.add_argument("--memory_capacity", default=1000,
                        type=int, help="capacity of Replay Memory")

    parser.add_argument("--batch_size", default=32, type=int,
                        help="batch size of memory sampling")
    parser.add_argument("--train_eps", default=200, type=int)  # number of training episodes
    parser.add_argument("--train_steps", default=200, type=int)  # maximum steps per training episode
    parser.add_argument("--eval_eps", default=200, type=int)  # number of evaluation episodes
    parser.add_argument("--eval_steps", default=200, type=int)  # maximum steps per evaluation episode
    # The help text is rendered from the real default so the two cannot drift apart.
    parser.add_argument("--target_update", default=2, type=int,
                        help="number of episodes between target net updates (default: %(default)s)")
    config = parser.parse_args()

    return config
|
||||||
|
|
||||||
|
def train():
    """Train a DQN agent on CartPole-v0, then save the checkpoint, the logs and the plots.

    The checkpoint goes to ``saved_model/checkpoint.pth`` and the per-episode
    rewards, moving-average rewards and step counts go to ``result/*.npy``,
    both next to this file.
    """
    # Imported locally because the module header does not import them.
    import os
    import numpy as np

    cfg = get_args()
    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The environment is unwrapped; episode length is bounded by cfg.train_steps below.
    env = gym.make('CartPole-v0').unwrapped
    env.seed(1)  # seed the environment
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    for i_episode in range(1, cfg.train_eps+1):
        state = env.reset()  # reset the environment
        ep_reward = 0
        # Defined up front so the report below works even if train_steps < 1.
        i_step, done = 0, False
        for i_step in range(1, cfg.train_steps+1):
            action = agent.select_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # advance the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition
            state = next_state
            agent.update()  # one optimisation step per environment step
            if done:
                break
        # Periodically copy the policy network's weights into the target network.
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:', i_episode, ' Reward: %i' %
              int(ep_reward), 'n_steps:', i_step, 'done: ', done, ' Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # Exponential moving average of the episode reward.
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)

    # Resolve output directories from the absolute location of this file:
    # dirname(__file__) alone is "" when __file__ is a bare relative name,
    # which would turn "/saved_model/" into a path under the filesystem root.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    save_path = os.path.join(base_dir, "saved_model")
    os.makedirs(save_path, exist_ok=True)
    agent.save_model(os.path.join(save_path, 'checkpoint.pth'))
    # Store the rewards and related results.
    output_path = os.path.join(base_dir, "result")
    os.makedirs(output_path, exist_ok=True)
    np.save(os.path.join(output_path, "rewards.npy"), rewards)
    np.save(os.path.join(output_path, "moving_average_rewards.npy"), moving_average_rewards)
    np.save(os.path.join(output_path, "steps.npy"), ep_steps)
    print('Complete!')
    plot(rewards)
    plot(moving_average_rewards, ylabel="moving_average_rewards")
    plot(ep_steps, ylabel="steps_of_each_episode")
|
||||||
|
|
||||||
|
def eval():
    """Load the saved checkpoint and run greedy evaluation episodes on CartPole-v0.

    Per-episode rewards, moving-average rewards and step counts are plotted
    but not saved. The name shadows the ``eval`` builtin inside this module;
    it is kept because the module's entry point calls it.

    Raises:
        FileNotFoundError: if ``saved_model/checkpoint.pth`` does not exist
            next to this file (run ``train`` first).
    """
    import os

    cfg = get_args()
    # Same device selection as train().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = gym.make('CartPole-v0').unwrapped  # unwrapped; episode length is bounded by cfg.eval_steps
    env.seed(1)  # seed the environment
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    # Both ends of the epsilon schedule are pinned to 0 so that evaluation
    # follows the learned greedy policy and is not dominated by random
    # actions; the --epsilon_start/--epsilon_end options are ignored here.
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=0.0,
                epsilon_end=0.0, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
    # Resolve the checkpoint from the absolute location of this file;
    # dirname(__file__) alone can be "" and would point at "/saved_model/".
    # No directory is created here: a missing checkpoint should fail loudly.
    save_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "saved_model")
    agent.load_model(os.path.join(save_path, 'checkpoint.pth'))
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    for i_episode in range(1, cfg.eval_eps+1):
        state = env.reset()  # reset the environment
        ep_reward = 0
        # Defined up front so the report below works even if eval_steps < 1.
        i_step, done = 0, False
        for i_step in range(1, cfg.eval_steps+1):
            action = agent.select_action(state)  # choose an action for the current state
            next_state, reward, done, _ = env.step(action)  # advance the environment
            ep_reward += reward
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' %
              int(ep_reward), 'n_steps:', i_step, 'done: ', done, ' Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # Exponential moving average of the episode reward.
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
    plot(rewards, save_fig=False)
    plot(moving_average_rewards, ylabel="moving_average_rewards", save_fig=False)
    plot(ep_steps, ylabel="steps_of_each_episode", save_fig=False)
|
||||||
|
if __name__ == "__main__":
    # Only evaluation runs by default; call train() here instead to produce
    # the checkpoint that eval() loads.
    eval()
|
||||||
35
codes/double_dqn/memory.py
Normal file
35
codes/double_dqn/memory.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
@Author: John
|
||||||
|
@Email: johnjim0816@gmail.com
|
||||||
|
@Date: 2020-06-10 15:27:16
|
||||||
|
@LastEditor: John
|
||||||
|
@LastEditTime: 2020-06-14 11:36:24
|
||||||
|
@Description:
|
||||||
|
@Environment: python 3.7.7
|
||||||
|
'''
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) transitions.

    Once ``capacity`` transitions are stored, every new one overwrites the
    oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions kept; must be positive.

        Raises:
            ValueError: if ``capacity`` is not positive. Without this check
                the problem only surfaced on the first ``push`` as an
                unrelated ``IndexError``.
        """
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer, got %r" % (capacity,))
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one when the buffer is full."""
        if len(self.buffer) < self.capacity:
            # Still growing: reserve the slot that is written just below.
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five tuples (states, actions, rewards, next_states, dones), each of
            length ``batch_size``.

        Raises:
            ValueError: raised by ``random.sample`` when ``batch_size`` exceeds
                the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
|
||||||
|
|
||||||
30
codes/double_dqn/model.py
Normal file
30
codes/double_dqn/model.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
@Author: John
|
||||||
|
@Email: johnjim0816@gmail.com
|
||||||
|
@Date: 2020-06-12 00:47:02
|
||||||
|
@LastEditor: John
|
||||||
|
LastEditTime: 2020-08-19 16:55:54
|
||||||
|
@Description:
|
||||||
|
@Environment: python 3.7.7
|
||||||
|
'''
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
class FCN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers followed by a linear output layer."""

    def __init__(self, n_states=4, n_actions=18, hidden_dim=128):
        """Build the three linear layers.

        Args:
            n_states: number of input features (size of the state vector).
            n_actions: number of outputs, one per action.
            hidden_dim: width of both hidden layers. The default of 128 keeps
                the layer shapes, and therefore existing checkpoints,
                unchanged.
        """
        super(FCN, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer

    def forward(self, x):
        """Return the output of the last linear layer for the input ``x``."""
        # ReLU after each of the first two layers; the output layer stays linear.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
|
||||||
34
codes/double_dqn/plot.py
Normal file
34
codes/double_dqn/plot.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
@Author: John
|
||||||
|
@Email: johnjim0816@gmail.com
|
||||||
|
@Date: 2020-06-11 16:30:09
|
||||||
|
@LastEditor: John
|
||||||
|
LastEditTime: 2020-09-01 22:46:43
|
||||||
|
@Description:
|
||||||
|
@Environment: python 3.7.7
|
||||||
|
'''
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
|
||||||
|
def plot(item, ylabel='rewards', save_fig=True):
    """Plot one per-episode series and optionally save the figure.

    Args:
        item: sequence of values, one per episode.
        ylabel: y-axis label; also used in the title and as the file name of
            the saved figure.
        save_fig: when true, the figure is written to ``result/<ylabel>.png``
            next to this file before it is shown.
    """
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DQN')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        # Resolve from the absolute location of this file: dirname(__file__)
        # alone can be "" and would point at "/result/". The directory is
        # created on demand so savefig cannot fail on a fresh checkout.
        result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "result")
        os.makedirs(result_dir, exist_ok=True)
        plt.savefig(os.path.join(result_dir, ylabel+".png"))
    plt.show()
|
||||||
|
if __name__ == "__main__":
    # Re-plot the series saved by a previous training run. The directory is
    # resolved from the absolute location of this file because
    # dirname(__file__) alone can be "" and would point at "/result/".
    output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "result")
    rewards = np.load(os.path.join(output_path, "rewards.npy"))
    moving_average_rewards = np.load(os.path.join(output_path, "moving_average_rewards.npy"))
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards')
|
||||||
BIN
codes/double_dqn/result/moving_average_rewards.npy
Normal file
BIN
codes/double_dqn/result/moving_average_rewards.npy
Normal file
Binary file not shown.
BIN
codes/double_dqn/result/moving_average_rewards.png
Normal file
BIN
codes/double_dqn/result/moving_average_rewards.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 36 KiB |
BIN
codes/double_dqn/result/rewards.npy
Normal file
BIN
codes/double_dqn/result/rewards.npy
Normal file
Binary file not shown.
BIN
codes/double_dqn/result/rewards.png
Normal file
BIN
codes/double_dqn/result/rewards.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 48 KiB |
BIN
codes/double_dqn/result/steps.npy
Normal file
BIN
codes/double_dqn/result/steps.npy
Normal file
Binary file not shown.
BIN
codes/double_dqn/result/steps_of_each_episode.png
Normal file
BIN
codes/double_dqn/result/steps_of_each_episode.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 54 KiB |
BIN
codes/double_dqn/saved_model/checkpoint.pth
Normal file
BIN
codes/double_dqn/saved_model/checkpoint.pth
Normal file
Binary file not shown.
Reference in New Issue
Block a user