update DoubleDQN

This commit is contained in:
JohnJim0816
2020-12-22 16:07:09 +08:00
parent 09f983120a
commit ac1d4b0a74
54 changed files with 264 additions and 146 deletions

33
codes/DoubleDQN/README.md Normal file
View File

@@ -0,0 +1,33 @@
## 思路
见[博客](https://blog.csdn.net/JohnJim0/article/details/111552545)
## 环境
python 3.7.9
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0
## 使用
train:
```bash
python main.py
```
eval:
```bash
python main.py --train 0
```
可视化:
```bash
tensorboard --logdir logs
```

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2020-09-01 22:54:02
LastEditTime: 2020-12-22 14:44:46
@Discription:
@Environment: python 3.7.7
'''
@@ -13,8 +13,6 @@ LastEditTime: 2020-09-01 22:54:02
'''
import torch
import torch.nn as nn
import torch.optim as optim
@@ -46,32 +44,41 @@ class DQN:
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
def select_action(self, state):
def choose_action(self, state, train=True):
'''选择动作
Args:
state [array]: [description]
Returns:
action [array]: [description]
'''
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
if train:
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
# 先转为张量便于丢给神经网络,state元素数据原本为float64
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1)返回每行的最大值以及对应的下标,
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
else:
with torch.no_grad():
# 先转为张量便于丢给神经网络,state元素数据原本为float64
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1)返回每行的最大值以及对应的下标,
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
# 先转为张量便于丢给神经网络,state元素数据原本为float64
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
state = torch.tensor(
[state], device='cpu', dtype=torch.float32)
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.target_net(state)
# tensor.max(1)返回每行的最大值以及对应的下标,
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item()
return action
def update(self):
if len(self.memory) < self.batch_size:
@@ -93,19 +100,25 @@ class DQN:
done_batch), device=self.device).unsqueeze(1) # 将bool转为float然后转为张量
# 计算当前(s_t,a)对应的Q(s_t, a)
# 关于torch.gather,对于a=torch.Tensor([[1,2],[3,4]])
# 那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])
q_values = self.policy_net(state_batch).gather(
dim=1, index=action_batch) # 等价于self.forward
# 计算所有next states的V(s_{t+1})即通过target_net中选取reward最大的对应states
next_state_values = self.target_net(
q_values = self.policy_net(state_batch)
next_q_values = self.policy_net(state_batch)
# 代入当前选择的action得到Q(s_t|a=a_t)
q_value = q_values.gather(dim=1, index=action_batch)
'''以下是Nature DQN的q_target计算方式
# 计算所有next states的Q'(s_{t+1})的最大值Q'为目标网络的q函数
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,])
# 计算 expected_q_value
# 计算 q_target
# 对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * \
next_state_values * (1-done_batch[0])
# self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss
self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算 均方误差loss
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''以下是Double DQNq_target计算方式与NatureDQN稍有不同'''
next_target_values = self.target_net(
next_state_batch)
# 选出Q(s_t, a)对应的action代入到next_target_values获得target net对应的next_q_value即Q(s_t|a=argmax Q(s_t, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # 计算 均方误差loss
# 优化模型
self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step
# loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
@@ -113,9 +126,9 @@ class DQN:
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # 更新模型
def save_model(self,path):
torch.save(self.target_net.state_dict(), path)
def load_model(self,path):
self.policy_net.load_state_dict(torch.load(path))
self.target_net.load_state_dict(torch.load(path))

View File

@@ -5,45 +5,21 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2020-09-01 22:54:23
LastEditTime: 2020-12-22 15:39:46
@Discription:
@Environment: python 3.7.7
'''
import gym
import torch
from dqn import DQN
from plot import plot
import argparse
from torch.utils.tensorboard import SummaryWriter
import os
from agent import DQN
from params import SEQUENCE,SAVED_MODEL_PATH,RESULT_PATH
from params import get_args
from utils import save_results
def get_args():
'''模型参数
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.99,
type=float) # q-learning中的gamma
parser.add_argument("--epsilon_start", default=0.95,
type=float) # 基于贪心选择action对应的参数epsilon
parser.add_argument("--epsilon_end", default=0.05, type=float)
parser.add_argument("--epsilon_decay", default=500, type=float)
parser.add_argument("--policy_lr", default=0.01, type=float)
parser.add_argument("--memory_capacity", default=1000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling")
parser.add_argument("--train_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--train_steps", default=200, type=int) # 训练每个episode的长度
parser.add_argument("--eval_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--eval_steps", default=200, type=int) # 训练每个episode的长度
parser.add_argument("--target_update", default=2, type=int,
help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
def train():
cfg = get_args()
# if gpu is to be used
def train(cfg):
print('Start to train !')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym此处一般不需要
env.seed(1) # 设置env随机种子
@@ -54,11 +30,13 @@ def train():
rewards = []
moving_average_rewards = []
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.train_eps+1):
state = env.reset() # reset环境状态
ep_reward = 0
for i_step in range(1, cfg.train_steps+1):
action = agent.select_action(state) # 根据当前环境state选择action
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action) # 更新环境参数
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done) # 将state等这些transition存入memory
@@ -79,54 +57,48 @@ def train():
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
import os
import numpy as np
save_path = os.path.dirname(__file__)+"/saved_model/"
if not os.path.exists(save_path):
os.mkdir(save_path)
agent.save_model(save_path+'checkpoint.pth')
# 存储reward等相关结果
output_path = os.path.dirname(__file__)+"/result/"
# 检测是否存在文件夹
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
np.save(output_path+"steps.npy", ep_steps)
print('Complete')
plot(rewards)
plot(moving_average_rewards, ylabel="moving_average_rewards")
plot(ep_steps, ylabel="steps_of_each_episode")
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
print('Complete training')
''' 保存模型 '''
if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹
os.mkdir(SAVED_MODEL_PATH)
agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
print('model saved')
'''存储reward等相关结果'''
save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH)
def eval():
cfg = get_args()
# if gpu is to be used
def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
print('start to eval !')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym此处一般不需要
env.seed(1) # 设置env随机种子
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states=n_states, n_actions=n_actions, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
import os
save_path = os.path.dirname(__file__)+"/saved_model/"
if not os.path.exists(save_path):
os.mkdir(save_path)
agent.load_model(save_path+'checkpoint.pth')
agent.load_model(saved_model_path+'checkpoint.pth')
rewards = []
moving_average_rewards = []
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.eval_eps+1):
state = env.reset() # reset环境状态
state = env.reset() # reset环境状态
ep_reward = 0
for i_step in range(1, cfg.eval_steps+1):
action = agent.select_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action) # 更新环境参数
action = agent.choose_action(state,train=False) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action) # 更新环境参数
ep_reward += reward
state = next_state # 跳转到下一个状态
state = next_state # 跳转到下一个状态
if done:
break
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'n_steps:', i_step, 'done: ', done,' Explore: %.2f' % agent.epsilon)
int(ep_reward), 'n_steps:', i_step, 'done: ', done)
ep_steps.append(i_step)
rewards.append(ep_reward)
# 计算滑动窗口的reward
@@ -135,9 +107,20 @@ def eval():
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
plot(rewards,save_fig=False)
plot(moving_average_rewards, ylabel="moving_average_rewards",save_fig=False)
plot(ep_steps, ylabel="steps_of_each_episode",save_fig=False)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
'''存储reward等相关结果'''
save_results(rewards,moving_average_rewards,ep_steps,tag='eval',result_path=RESULT_PATH)
print('Complete evaling')
if __name__ == "__main__":
# train()
eval()
cfg = get_args()
if cfg.train:
train(cfg)
eval(cfg)
else:
model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
eval(cfg,saved_model_path=model_path)

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-14 11:36:24
LastEditTime: 2020-12-22 12:56:27
@Discription:
@Environment: python 3.7.7
'''
@@ -15,21 +15,27 @@ import numpy as np
class ReplayBuffer:
    '''Bounded store of transitions; once full, new pushes overwrite the oldest slot.
    '''

    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of transitions kept
        self.buffer = []  # stored (state, action, reward, next_state, done) tuples
        self.position = 0  # index the next push writes to

    def push(self, state, action, reward, next_state, done):
        '''Store one transition, wrapping around to index 0 after `capacity` pushes.
        '''
        transition = (state, action, reward, next_state, done)
        still_growing = len(self.buffer) < self.capacity
        if still_growing:
            # Reserve the slot that `position` is about to fill.
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        '''Draw `batch_size` stored transitions at random (without replacement).

        Returns:
            Five tuples (states, actions, rewards, next_states, dones), each of
            length `batch_size`.
        '''
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        '''Return the number of transitions currently stored.
        '''
        return len(self.buffer)

48
codes/DoubleDQN/params.py Normal file
View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-12-22 15:22:17
LastEditor: John
LastEditTime: 2020-12-22 15:26:09
Discription:
Environment:
'''
import datetime
import os
import argparse
ALGO_NAME = 'Double DQN'
# Timestamp taken once at import time; it names this run's output folders.
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Directory containing this module, shared by both output paths below.
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
SAVED_MODEL_PATH = _CURRENT_DIR + "/saved_model/" + SEQUENCE + '/'
RESULT_PATH = _CURRENT_DIR + "/result/" + SEQUENCE + '/'
def get_args(argv=None):
    '''Parse the hyper-parameters and run options of the experiment.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the process command line, exactly as before.
    Returns:
        argparse.Namespace holding one attribute per option defined below.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default=1, type=int,
                        help="1 to train (then evaluate), 0 to only evaluate")
    parser.add_argument("--gamma", default=0.99, type=float,
                        help="discount factor gamma used in Q-learning")
    parser.add_argument("--epsilon_start", default=0.95, type=float,
                        help="initial epsilon of epsilon-greedy action selection")
    parser.add_argument("--epsilon_end", default=0.01, type=float,
                        help="final epsilon of epsilon-greedy action selection")
    parser.add_argument("--epsilon_decay", default=500, type=float,
                        help="decay constant of the exponential epsilon schedule")
    parser.add_argument("--policy_lr", default=0.01, type=float,
                        help="value handed to the agent as policy_lr")
    parser.add_argument("--memory_capacity", default=1000,
                        type=int, help="capacity of Replay Memory")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="batch size of memory sampling")
    parser.add_argument("--train_eps", default=200, type=int,
                        help="number of training episodes")
    parser.add_argument("--train_steps", default=200, type=int,
                        help="maximum number of steps per training episode")
    parser.add_argument("--target_update", default=2, type=int,
                        help="update the target net every N episodes (default 2)")
    parser.add_argument("--eval_eps", default=100, type=int,
                        help="number of evaluation episodes")
    parser.add_argument("--eval_steps", default=200, type=int,
                        help="maximum number of steps per evaluation episode")
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    config = parser.parse_args(argv)
    return config

48
codes/DoubleDQN/plot.py Normal file
View File

@@ -0,0 +1,48 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-12-22 15:24:31
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from params import ALGO_NAME
def plot(item, ylabel='rewards_train', save_fig=True):
    '''Draw one per-episode curve with seaborn styling and optionally save it.

    Args:
        item: sequence of values, one per episode, plotted against its index.
        ylabel: y-axis label; also used in the title and as the PNG file name.
        save_fig: when True, write "<ylabel>.png" into the "result" folder
            located next to this file.
    '''
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of '+ALGO_NAME)
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        # Resolve against the absolute location of this file: a bare
        # os.path.dirname(__file__) is '' when the script is started by a
        # relative path, which would turn the target into "/result/...".
        result_dir = os.path.split(os.path.abspath(__file__))[0]+"/result/"
        # savefig does not create missing folders.
        os.makedirs(result_dir, exist_ok=True)
        plt.savefig(result_dir+ylabel+".png")
    plt.show()
if __name__ == "__main__":
    # Reload the arrays stored in the "result" folder next to this file and
    # redraw the three curves for the training run, then for the evaluation run.
    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
    for tag in ('train', 'eval'):
        rewards = np.load(output_path+"rewards_"+tag+".npy")
        moving_average_rewards = np.load(
            output_path+"moving_average_rewards_"+tag+".npy")
        steps = np.load(output_path+"steps_"+tag+".npy")
        plot(rewards, ylabel='rewards_'+tag)
        plot(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
        plot(steps, ylabel='steps_'+tag)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

21
codes/DoubleDQN/utils.py Normal file
View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-15 21:28:00
LastEditor: John
LastEditTime: 2020-10-15 21:50:30
Discription:
Environment:
'''
import os
import numpy as np
def save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path='./result'):
    '''Save the per-episode statistics of one run as .npy files.

    Writes "rewards_<tag>.npy", "moving_average_rewards_<tag>.npy" and
    "steps_<tag>.npy" inside `result_path`.

    Args:
        rewards: per-episode rewards.
        moving_average_rewards: smoothed per-episode rewards.
        ep_steps: number of steps of each episode.
        tag: suffix of the file names, e.g. 'train' or 'eval'.
        result_path: target directory, with or without a trailing separator.
    '''
    # makedirs also creates missing parents (e.g. "result/<timestamp>/") and
    # does not fail when the directory already exists.
    os.makedirs(result_path, exist_ok=True)
    # os.path.join keeps the files inside result_path even when the path has
    # no trailing separator, as is the case for the default value.
    np.save(os.path.join(result_path, 'rewards_'+tag+'.npy'), rewards)
    np.save(os.path.join(result_path, 'moving_average_rewards_'+tag+'.npy'),
            moving_average_rewards)
    np.save(os.path.join(result_path, 'steps_'+tag+'.npy'), ep_steps)

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-09-01 22:46:43
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
def plot(item, ylabel='rewards', save_fig=True):
    '''Draw one per-episode curve with seaborn styling and optionally save it.

    Args:
        item: sequence of values, one per episode, plotted against its index.
        ylabel: y-axis label; also used in the title and as the PNG file name.
        save_fig: when True, write "<ylabel>.png" into the "result" folder
            located next to this file.
    '''
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DQN')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        # Use the absolute location of this file: os.path.dirname(__file__)
        # is '' when the script is started by a relative path, which would
        # turn the target into "/result/...".
        result_dir = os.path.dirname(os.path.abspath(__file__))+"/result/"
        # savefig does not create missing folders.
        os.makedirs(result_dir, exist_ok=True)
        plt.savefig(result_dir+ylabel+".png")
    plt.show()
if __name__ == "__main__":
    # Reload the saved reward curves from the "result" folder next to this
    # file and redraw them. The folder is resolved from the absolute path of
    # this file because os.path.dirname(__file__) is '' when the script is
    # started by a relative path, which would point at "/result/".
    output_path = os.path.dirname(os.path.abspath(__file__))+"/result/"
    rewards = np.load(output_path+"rewards.npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards.npy")
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards')

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 54 KiB