update DoubleDQN

JohnJim0816
2020-12-22 16:07:09 +08:00
parent 09f983120a
commit ac1d4b0a74
54 changed files with 264 additions and 146 deletions

codes/DoubleDQN/README.md (new file, 33 lines)

@@ -0,0 +1,33 @@
## Approach
See the [blog post](https://blog.csdn.net/JohnJim0/article/details/111552545).
## Environment
python 3.7.9
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0
## Usage
Train:
```bash
python main.py
```
Eval:
```bash
python main.py --train 0
```
Visualize with TensorBoard:
```bash
tensorboard --logdir logs
```

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
-LastEditTime: 2020-09-01 22:54:02
+LastEditTime: 2020-12-22 14:44:46
@Discription:
@Environment: python 3.7.7
'''
@@ -13,8 +13,6 @@ LastEditTime: 2020-09-01 22:54:02
'''
import torch
import torch.nn as nn
import torch.optim as optim
@@ -46,32 +44,41 @@ class DQN:
        self.loss = 0
        self.memory = ReplayBuffer(memory_capacity)
-    def select_action(self, state):
+    def choose_action(self, state, train=True):
        '''Choose an action
-        Args:
-            state [array]: [description]
-        Returns:
-            action [array]: [description]
        '''
-        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
-            math.exp(-1. * self.actions_count / self.epsilon_decay)
-        self.actions_count += 1
-        if random.random() > self.epsilon:
-            with torch.no_grad():
-                # Convert the state to a tensor before feeding it to the network; the state elements are float64 by default
-                # Note: state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
-                state = torch.tensor(
-                    [state], device=self.device, dtype=torch.float32)
-                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
-                q_value = self.policy_net(state)
-                # tensor.max(1) returns the max value of each row together with its index,
-                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0])),
-                # so tensor.max(1)[1] is the index of the max value, i.e. the action
-                action = q_value.max(1)[1].item()
-        else:
-            action = random.randrange(self.n_actions)
-        return action
+        if train:
+            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
+                math.exp(-1. * self.actions_count / self.epsilon_decay)
+            self.actions_count += 1
+            if random.random() > self.epsilon:
+                with torch.no_grad():
+                    # Convert the state to a tensor before feeding it to the network; the state elements are float64 by default
+                    # Note: state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
+                    state = torch.tensor(
+                        [state], device=self.device, dtype=torch.float32)
+                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
+                    q_value = self.policy_net(state)
+                    # tensor.max(1) returns the max value of each row together with its index,
+                    # so tensor.max(1)[1] is the index of the max value, i.e. the action
+                    action = q_value.max(1)[1].item()
+            else:
+                action = random.randrange(self.n_actions)
+            return action
+        else:
+            with torch.no_grad():
+                # Same tensor conversion as above, but always on the CPU and using the target net
+                state = torch.tensor(
+                    [state], device='cpu', dtype=torch.float32)
+                q_value = self.target_net(state)
+                action = q_value.max(1)[1].item()
+            return action
    def update(self):
        if len(self.memory) < self.batch_size:
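As a side note, here is a minimal, self-contained sketch of the exponential epsilon schedule used in `choose_action` above; the helper name `epsilon_at` is made up for illustration, and the defaults are taken from the new `params.py`.

```python
import math

def epsilon_at(step, eps_start=0.95, eps_end=0.01, eps_decay=500):
    # Exponential decay from eps_start towards eps_end, as in choose_action above.
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

for step in (0, 500, 2000):
    print(step, round(epsilon_at(step), 3))  # 0.95, then roughly 0.36 and 0.03
```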
@@ -93,19 +100,25 @@ class DQN:
            done_batch), device=self.device).unsqueeze(1)  # convert bool to float, then to a tensor
        # Compute Q(s_t, a) for the current (s_t, a)
-        # About torch.gather: for a=torch.Tensor([[1,2],[3,4]]),
-        # a.gather(1, torch.Tensor([[0],[1]])) = torch.Tensor([[1],[3]])
-        q_values = self.policy_net(state_batch).gather(
-            dim=1, index=action_batch)  # equivalent to self.forward
-        # Compute V(s_{t+1}) for all next states, i.e. the highest-valued next states according to target_net
-        next_state_values = self.target_net(
-            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
-        # Compute expected_q_value
-        # For terminal states done_batch[0]=1, so the expected_q_value equals the reward
-        expected_q_values = reward_batch + self.gamma * \
-            next_state_values * (1-done_batch[0])
-        # self.loss = F.smooth_l1_loss(q_values, expected_q_values.unsqueeze(1))  # Huber loss
-        self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
+        q_values = self.policy_net(state_batch)
+        next_q_values = self.policy_net(next_state_batch)
+        # Plug in the chosen actions to get Q(s_t|a=a_t)
+        q_value = q_values.gather(dim=1, index=action_batch)
+        '''The following is the Nature DQN way of computing q_target:
+        # Take the max over actions of Q'(s_{t+1}), where Q' is the target network's Q function
+        next_q_state_value = self.target_net(
+            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
+        # Compute q_target
+        # For terminal states done_batch[0]=1, so the expected_q_value equals the reward
+        q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
+        '''
+        '''The following is the Double DQN way of computing q_target, slightly different from Nature DQN'''
+        next_target_values = self.target_net(
+            next_state_batch)
+        # Plug the argmax action of Q(s_{t+1}, a) from the policy net into next_target_values to get the
+        # target net's next_q_value, i.e. Q'(s_{t+1}, argmax_a Q(s_{t+1}, a))
+        next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
+        q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
+        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))  # mean squared error loss
        # Optimize the model
        self.optimizer.zero_grad()  # zero_grad clears all the old gradients from the last step
        # loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters (that require gradients)
@@ -118,4 +131,4 @@ class DQN:
        torch.save(self.target_net.state_dict(), path)
    def load_model(self,path):
-        self.policy_net.load_state_dict(torch.load(path))
+        self.target_net.load_state_dict(torch.load(path))
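For reference, a minimal sketch (not the repository's code) contrasting the Nature DQN target with the Double DQN target computed in the `update` hunk above; the two linear networks, batch shapes, and variable names here are stand-ins chosen only to make the comparison runnable.

```python
import torch
import torch.nn as nn

# Stand-in networks and batch, just to make the sketch self-contained.
n_states, n_actions, batch_size, gamma = 4, 2, 8, 0.99
policy_net = nn.Linear(n_states, n_actions)
target_net = nn.Linear(n_states, n_actions)
next_state_batch = torch.randn(batch_size, n_states)
reward_batch = torch.randn(batch_size)
done_batch = torch.zeros(batch_size)

with torch.no_grad():
    # Nature DQN: the target net both selects and evaluates the next action.
    nature_q_target = reward_batch + gamma * \
        target_net(next_state_batch).max(1)[0] * (1 - done_batch)

    # Double DQN: the policy net selects the action, the target net evaluates it.
    next_actions = policy_net(next_state_batch).argmax(dim=1, keepdim=True)
    double_q = target_net(next_state_batch).gather(1, next_actions).squeeze(1)
    double_q_target = reward_batch + gamma * double_q * (1 - done_batch)

print(nature_q_target.shape, double_q_target.shape)  # torch.Size([8]) torch.Size([8])
```

Decoupling action selection (policy net) from action evaluation (target net) is what reduces the overestimation bias of the max operator in Nature DQN.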

View File

@@ -5,45 +5,21 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
-LastEditTime: 2020-09-01 22:54:23
+LastEditTime: 2020-12-22 15:39:46
@Discription:
@Environment: python 3.7.7
'''
import gym
import torch
-from dqn import DQN
-from plot import plot
-import argparse
+from torch.utils.tensorboard import SummaryWriter
+import os
+from agent import DQN
+from params import SEQUENCE,SAVED_MODEL_PATH,RESULT_PATH
+from params import get_args
+from utils import save_results
-def get_args():
-    '''Model parameters
-    '''
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--gamma", default=0.99,
-                        type=float)  # gamma in q-learning
-    parser.add_argument("--epsilon_start", default=0.95,
-                        type=float)  # the epsilon parameter for epsilon-greedy action selection
-    parser.add_argument("--epsilon_end", default=0.05, type=float)
-    parser.add_argument("--epsilon_decay", default=500, type=float)
-    parser.add_argument("--policy_lr", default=0.01, type=float)
-    parser.add_argument("--memory_capacity", default=1000,
-                        type=int, help="capacity of Replay Memory")
-    parser.add_argument("--batch_size", default=32, type=int,
-                        help="batch size of memory sampling")
-    parser.add_argument("--train_eps", default=200, type=int)  # max number of training episodes
-    parser.add_argument("--train_steps", default=200, type=int)  # max steps per training episode
-    parser.add_argument("--eval_eps", default=200, type=int)  # max number of evaluation episodes
-    parser.add_argument("--eval_steps", default=200, type=int)  # max steps per evaluation episode
-    parser.add_argument("--target_update", default=2, type=int,
-                        help="when (every default 10 episodes) to update target net")
-    config = parser.parse_args()
-    return config
-def train():
-    cfg = get_args()
-    # if gpu is to be used
+def train(cfg):
+    print('Start to train !')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect gpu
    env = gym.make('CartPole-v0').unwrapped  # you can google why unwrapped is used; gym usually does not need it here
    env.seed(1)  # set the random seed of the env
@@ -54,11 +30,13 @@ def train():
    rewards = []
    moving_average_rewards = []
    ep_steps = []
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps+1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.train_steps+1):
-            action = agent.select_action(state)  # choose an action given the current env state
+            action = agent.choose_action(state)  # choose an action given the current env state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition (state etc.) in memory
@@ -79,54 +57,48 @@ def train():
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
-    import os
-    import numpy as np
-    save_path = os.path.dirname(__file__)+"/saved_model/"
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
-    agent.save_model(save_path+'checkpoint.pth')
-    # store the rewards and related results
-    output_path = os.path.dirname(__file__)+"/result/"
-    # check whether the folder exists
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    np.save(output_path+"rewards.npy", rewards)
-    np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
-    np.save(output_path+"steps.npy", ep_steps)
-    print('Complete')
-    plot(rewards)
-    plot(moving_average_rewards, ylabel="moving_average_rewards")
-    plot(ep_steps, ylabel="steps_of_each_episode")
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    print('Complete training')
+    ''' save the model '''
+    if not os.path.exists(SAVED_MODEL_PATH):  # check whether the folder exists
+        os.mkdir(SAVED_MODEL_PATH)
+    agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
+    print('model saved')
+    '''store the rewards and related results'''
+    save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH)
-def eval():
-    cfg = get_args()
-    # if gpu is to be used
+def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
+    print('start to eval !')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect gpu
    env = gym.make('CartPole-v0').unwrapped  # you can google why unwrapped is used; gym usually does not need it here
    env.seed(1)  # set the random seed of the env
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
-    agent = DQN(n_states=n_states, n_actions=n_actions, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
+    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
-    import os
-    save_path = os.path.dirname(__file__)+"/saved_model/"
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
-    agent.load_model(save_path+'checkpoint.pth')
+    agent.load_model(saved_model_path+'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps+1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps+1):
-            action = agent.select_action(state)  # choose an action given the current env state
+            action = agent.choose_action(state,train=False)  # choose an action given the current env state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move on to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' %
-              int(ep_reward), 'n_steps:', i_step, 'done: ', done,' Explore: %.2f' % agent.epsilon)
+              int(ep_reward), 'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the reward over a sliding window
@@ -135,9 +107,20 @@ def eval():
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
-    plot(rewards,save_fig=False)
-    plot(moving_average_rewards, ylabel="moving_average_rewards",save_fig=False)
-    plot(ep_steps, ylabel="steps_of_each_episode",save_fig=False)
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    '''store the rewards and related results'''
+    save_results(rewards,moving_average_rewards,ep_steps,tag='eval',result_path=RESULT_PATH)
+    print('Complete evaling')
if __name__ == "__main__":
-    # train()
-    eval()
+    cfg = get_args()
+    if cfg.train:
+        train(cfg)
+        eval(cfg)
+    else:
+        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
+        eval(cfg,saved_model_path=model_path)

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
-@LastEditTime: 2020-06-14 11:36:24
+LastEditTime: 2020-12-22 12:56:27
@Discription:
@Environment: python 3.7.7
'''
@@ -15,21 +15,27 @@ import numpy as np
class ReplayBuffer:
    def __init__(self, capacity):
-        self.capacity = capacity
+        self.capacity = capacity  # maximum capacity of the buffer
        self.buffer = []
        self.position = 0
    def push(self, state, action, reward, next_state, done):
+        '''Push a sample into the buffer, queue style
+        '''
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity
    def sample(self, batch_size):
+        '''Randomly sample batch_size samples
+        '''
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return state, action, reward, next_state, done
    def __len__(self):
+        '''Return the current length of the buffer
+        '''
        return len(self.buffer)
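A quick usage sketch of the buffer above, assuming the class is importable from this module; the transition values are dummies chosen only to exercise the push/sample API.

```python
buffer = ReplayBuffer(capacity=1000)
for _ in range(64):
    # dummy (state, action, reward, next_state, done) transition
    buffer.push([0.0, 0.0], 0, 1.0, [0.1, 0.1], False)

states, actions, rewards, next_states, dones = buffer.sample(32)
print(len(buffer), len(states))  # 64 32
```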

codes/DoubleDQN/params.py (new file, 48 lines)

@@ -0,0 +1,48 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-12-22 15:22:17
LastEditor: John
LastEditTime: 2020-12-22 15:26:09
Discription:
Environment:
'''
import datetime
import os
import argparse
ALGO_NAME = 'Double DQN'
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
def get_args():
    '''Model parameters
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means eval only
    parser.add_argument("--gamma", default=0.99,
                        type=float)  # gamma in q-learning
    parser.add_argument("--epsilon_start", default=0.95,
                        type=float)  # the epsilon parameter for epsilon-greedy action selection
    parser.add_argument("--epsilon_end", default=0.01, type=float)
    parser.add_argument("--epsilon_decay", default=500, type=float)
    parser.add_argument("--policy_lr", default=0.01, type=float)
    parser.add_argument("--memory_capacity", default=1000,
                        type=int, help="capacity of Replay Memory")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="batch size of memory sampling")
    parser.add_argument("--train_eps", default=200, type=int)  # max number of training episodes
    parser.add_argument("--train_steps", default=200, type=int)  # max steps per training episode
    parser.add_argument("--target_update", default=2, type=int,
                        help="when (every default 2 episodes) to update target net")  # update frequency
    parser.add_argument("--eval_eps", default=100, type=int)  # max number of evaluation episodes
    parser.add_argument("--eval_steps", default=200,
                        type=int)  # max steps per evaluation episode
    config = parser.parse_args()
    return config
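A small usage sketch, assuming the script is run from the codes/DoubleDQN directory with no extra command-line flags, so the defaults above apply; the printed attributes simply mirror those defaults.

```python
from params import get_args

cfg = get_args()  # parses sys.argv; with no flags, the defaults defined above are used
print(cfg.train, cfg.gamma, cfg.epsilon_end, cfg.batch_size)  # 1 0.99 0.01 32
```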

codes/DoubleDQN/plot.py (new file, 48 lines)

@@ -0,0 +1,48 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-12-22 15:24:31
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from params import ALGO_NAME
def plot(item, ylabel='rewards_train', save_fig=True):
    '''Plot the curve using seaborn
    '''
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of '+ALGO_NAME)
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
    plt.show()
    # plt.show()
if __name__ == "__main__":
    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
    tag = 'train'
    rewards = np.load(output_path+"rewards_"+tag+".npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards_"+tag+".npy")
    steps = np.load(output_path+"steps_"+tag+".npy")
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
    plot(steps, ylabel='steps_'+tag)
    tag = 'eval'
    rewards = np.load(output_path+"rewards_"+tag+".npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards_"+tag+".npy")
    steps = np.load(output_path+"steps_"+tag+".npy")
    plot(rewards, ylabel='rewards_'+tag)
    plot(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
    plot(steps, ylabel='steps_'+tag)

Several binary files were added or changed (including result images); contents not shown.

codes/DoubleDQN/utils.py (new file, 21 lines)

@@ -0,0 +1,21 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-15 21:28:00
LastEditor: John
LastEditTime: 2020-10-15 21:50:30
Discription:
Environment:
'''
import os
import numpy as np
def save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path='./result'):
    if not os.path.exists(result_path):  # check whether the folder exists
        os.mkdir(result_path)
    np.save(result_path+'rewards_'+tag+'.npy', rewards)
    np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards)
    np.save(result_path+'steps_'+tag+'.npy', ep_steps)

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-09-01 22:46:43
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
def plot(item, ylabel='rewards', save_fig=True):
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DQN')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
    plt.show()
if __name__ == "__main__":
    output_path = os.path.dirname(__file__)+"/result/"
    rewards = np.load(output_path+"rewards.npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards.npy")
    plot(rewards)
    plot(moving_average_rewards, ylabel='moving_average_rewards')

Several binary files were removed (including result images); contents not shown.