hot update PG

This commit is contained in:
johnjim0816
2022-08-25 21:00:53 +08:00
parent 4f4658503e
commit 80f20c73be
34 changed files with 1391 additions and 1695 deletions

View File

@@ -1,318 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义模型\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"import paddle.nn as nn\n",
"import paddle.nn.functional as F\n",
"import parl\n",
"\n",
"class CartpoleModel(parl.Model):\n",
" \"\"\" Linear network to solve Cartpole problem.\n",
" Args:\n",
" n_states (int): Dimension of observation space.\n",
" n_actions (int): Dimension of action space.\n",
" \"\"\"\n",
"\n",
" def __init__(self, n_states, n_actions):\n",
" super(CartpoleModel, self).__init__()\n",
" hid1_size = 128\n",
" hid2_size = 128\n",
" self.fc1 = nn.Linear(n_states, hid1_size)\n",
" self.fc2 = nn.Linear(hid1_size, hid2_size)\n",
" self.fc3 = nn.Linear(hid2_size, n_actions)\n",
"\n",
" def forward(self, obs):\n",
" h1 = F.relu(self.fc1(obs))\n",
" h2 = F.relu(self.fc2(h1))\n",
" Q = self.fc3(h2)\n",
" return Q"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import parl\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"\n",
"class CartpoleAgent(parl.Agent):\n",
" \"\"\"Agent of Cartpole env.\n",
" Args:\n",
" algorithm(parl.Algorithm): algorithm used to solve the problem.\n",
" \"\"\"\n",
"\n",
" def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):\n",
" super(CartpoleAgent, self).__init__(algorithm)\n",
" assert isinstance(n_actions, int)\n",
" self.n_actions = n_actions\n",
"\n",
" self.global_step = 0\n",
" self.update_target_steps = 200\n",
"\n",
" self.e_greed = e_greed\n",
" self.e_greed_decrement = e_greed_decrement\n",
"\n",
" def sample(self, obs):\n",
" \"\"\"Sample an action `for exploration` when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" sample = np.random.random()\n",
" if sample < self.e_greed:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" if np.random.random() < 0.01:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" act = self.predict(obs)\n",
" self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)\n",
" return act\n",
"\n",
" def predict(self, obs):\n",
" \"\"\"Predict an action when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" pred_q = self.alg.predict(obs)\n",
" act = pred_q.argmax().numpy()[0]\n",
" return act\n",
"\n",
" def learn(self, obs, act, reward, next_obs, terminal):\n",
" \"\"\"Update model with an episode data\n",
" Args:\n",
" obs(np.float32): shape of (batch_size, n_states)\n",
" act(np.int32): shape of (batch_size)\n",
" reward(np.float32): shape of (batch_size)\n",
" next_obs(np.float32): shape of (batch_size, n_states)\n",
" terminal(np.float32): shape of (batch_size)\n",
" Returns:\n",
" loss(float)\n",
" \"\"\"\n",
" if self.global_step % self.update_target_steps == 0:\n",
" self.alg.sync_target()\n",
" self.global_step += 1\n",
"\n",
" act = np.expand_dims(act, axis=-1)\n",
" reward = np.expand_dims(reward, axis=-1)\n",
" terminal = np.expand_dims(terminal, axis=-1)\n",
"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" act = paddle.to_tensor(act, dtype='int32')\n",
" reward = paddle.to_tensor(reward, dtype='float32')\n",
" next_obs = paddle.to_tensor(next_obs, dtype='float32')\n",
" terminal = paddle.to_tensor(terminal, dtype='float32')\n",
" loss = self.alg.learn(obs, act, reward, next_obs, terminal)\n",
" return loss.numpy()[0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import gym\n",
"import numpy as np\n",
"import parl\n",
"\n",
"from parl.utils import logger, ReplayMemory\n",
"from parl.algorithms import DQN\n",
"\n",
"LEARN_FREQ = 5 # training frequency\n",
"MEMORY_SIZE = 200000\n",
"MEMORY_WARMUP_SIZE = 200\n",
"BATCH_SIZE = 64\n",
"LEARNING_RATE = 0.0005\n",
"GAMMA = 0.99\n",
"\n",
"# train an episode\n",
"def run_train_episode(agent, env, rpm):\n",
" total_reward = 0\n",
" obs = env.reset()\n",
" step = 0\n",
" while True:\n",
" step += 1\n",
" action = agent.sample(obs)\n",
" next_obs, reward, done, _ = env.step(action)\n",
" rpm.append(obs, action, reward, next_obs, done)\n",
"\n",
" # train model\n",
" if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):\n",
" # s,a,r,s',done\n",
" (batch_obs, batch_action, batch_reward, batch_next_obs,\n",
" batch_done) = rpm.sample_batch(BATCH_SIZE)\n",
" train_loss = agent.learn(batch_obs, batch_action, batch_reward,\n",
" batch_next_obs, batch_done)\n",
"\n",
" total_reward += reward\n",
" obs = next_obs\n",
" if done:\n",
" break\n",
" return total_reward\n",
"\n",
"\n",
"# evaluate 5 episodes\n",
"def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):\n",
" eval_reward = []\n",
" for i in range(eval_episodes):\n",
" obs = env.reset()\n",
" episode_reward = 0\n",
" while True:\n",
" action = agent.predict(obs)\n",
" obs, reward, done, _ = env.step(action)\n",
" episode_reward += reward\n",
" if render:\n",
" env.render()\n",
" if done:\n",
" break\n",
" eval_reward.append(episode_reward)\n",
" return np.mean(eval_reward)\n",
"\n",
"\n",
"def main(args):\n",
" env = gym.make('CartPole-v0')\n",
" n_states = env.observation_space.shape[0]\n",
" n_actions = env.action_space.n\n",
" logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))\n",
"\n",
" # set action_shape = 0 while in discrete control environment\n",
" rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)\n",
"\n",
" # build an agent\n",
" model = CartpoleModel(n_states=n_states, n_actions=n_actions)\n",
" alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)\n",
" agent = CartpoleAgent(\n",
" alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)\n",
"\n",
" # warmup memory\n",
" while len(rpm) < MEMORY_WARMUP_SIZE:\n",
" run_train_episode(agent, env, rpm)\n",
"\n",
" max_episode = args.max_episode\n",
"\n",
" # start training\n",
" episode = 0\n",
" while episode < max_episode:\n",
" # train part\n",
" for i in range(50):\n",
" total_reward = run_train_episode(agent, env, rpm)\n",
" episode += 1\n",
"\n",
" # test part\n",
" eval_reward = run_evaluate_episodes(agent, env, render=False)\n",
" logger.info('episode:{} e_greed:{} Test reward:{}'.format(\n",
" episode, agent.e_greed, eval_reward))\n",
"\n",
" # save the parameters to ./model.ckpt\n",
" save_path = './model.ckpt'\n",
" agent.save(save_path)\n",
"\n",
" # save the model and parameters of policy network for inference\n",
" save_inference_path = './inference_model'\n",
" input_shapes = [[None, env.observation_space.shape[0]]]\n",
" input_dtypes = ['float32']\n",
" agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)\n",
"\n",
"\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:64]\u001b[0m obs_dim 4, act_dim 2\n",
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:92]\u001b[0m episode:50 e_greed:0.0988929999999989 Test reward:18.4\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:100 e_greed:0.09794799999999795 Test reward:9.6\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:150 e_greed:0.0973899999999974 Test reward:37.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:200 e_greed:0.09684299999999685 Test reward:8.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:250 e_greed:0.09635499999999636 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:300 e_greed:0.09585299999999586 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:350 e_greed:0.09535799999999536 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:400 e_greed:0.09486399999999487 Test reward:10.0\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:450 e_greed:0.09435299999999436 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:500 e_greed:0.09384899999999385 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:550 e_greed:0.09302299999999303 Test reward:69.0\n",
"\u001b[32m[08-01 21:48:25 MainThread @3996942455.py:92]\u001b[0m episode:600 e_greed:0.08774199999998775 Test reward:141.2\n",
"\u001b[32m[08-01 21:48:30 MainThread @3996942455.py:92]\u001b[0m episode:650 e_greed:0.0791019999999791 Test reward:184.0\n",
"\u001b[32m[08-01 21:48:35 MainThread @3996942455.py:92]\u001b[0m episode:700 e_greed:0.07011299999997012 Test reward:182.0\n",
"\u001b[32m[08-01 21:48:40 MainThread @3996942455.py:92]\u001b[0m episode:750 e_greed:0.06089099999996089 Test reward:197.4\n",
"\u001b[32m[08-01 21:48:45 MainThread @3996942455.py:92]\u001b[0m episode:800 e_greed:0.05139199999995139 Test reward:183.4\n",
"\u001b[32m[08-01 21:48:50 MainThread @3996942455.py:92]\u001b[0m episode:850 e_greed:0.042255999999942256 Test reward:153.0\n",
"\u001b[32m[08-01 21:48:55 MainThread @3996942455.py:92]\u001b[0m episode:900 e_greed:0.033495999999933496 Test reward:192.6\n",
"\u001b[32m[08-01 21:49:00 MainThread @3996942455.py:92]\u001b[0m episode:950 e_greed:0.024318999999924318 Test reward:166.6\n",
"\u001b[32m[08-01 21:49:06 MainThread @3996942455.py:92]\u001b[0m episode:1000 e_greed:0.014873999999916176 Test reward:187.0\n"
]
}
],
"source": [
"import argparse\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\n",
" '--max_episode',\n",
" type=int,\n",
" default=1000,\n",
" help='stop condition: number of max episode')\n",
"args = parser.parse_args(args=[])\n",
"\n",
"main(args)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.12 ('rl_tutorials')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4f613f1ab80ec98dc1b91d6e720de51301598a187317378e53e49b773c1123dd"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
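The notebook removed above trains a PARL DQN agent on CartPole and saves its parameters to `./model.ckpt`. As a minimal sketch (illustrative, not part of this commit), the checkpoint could be reloaded for evaluation roughly as follows, assuming the notebook's `CartpoleModel`, `CartpoleAgent`, `DQN` and `run_evaluate_episodes` definitions and PARL's `agent.restore` counterpart to `agent.save`:

```python
import gym

# Rebuild the same model / algorithm / agent stack defined in the notebook cells above.
model = CartpoleModel(n_states=4, n_actions=2)
alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)
agent = CartpoleAgent(alg, n_actions=2)

# Load the parameters previously written by agent.save('./model.ckpt').
agent.restore('./model.ckpt')

env = gym.make('CartPole-v0')
print(run_evaluate_episodes(agent, env, eval_episodes=5, render=False))
```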

View File

@@ -1,11 +0,0 @@
[PARL](https://github.com/PaddlePaddle/PARL) is a high-performance, flexible reinforcement learning framework developed by Baidu AI Studio.
## Installation
1. Install parl following the instructions at [PARL Github](https://github.com/PaddlePaddle/PARL)
2. Install paddlepaddle: ```pip install paddlepaddle```
## FAQ
If you see ```jupyter-client 7.3.1 requires pyzmq>=22.3, but you have pyzmq 18.1.1 which is incompatible.```, fix it with:
```pip install -U pyzmq```

View File

@@ -11,7 +11,6 @@
The project mainly consists of the following parts:
* [Jupyter Notebook](./notebooks/): algorithms written as notebooks, with fairly detailed hands-on guidance; recommended for beginners
* [codes](./codes/): algorithms written as Python scripts, in a style closer to real-world projects; recommended for readers with some coding experience (the concrete architecture is described below)
* [parl](./PARL/): RL examples based on Baidu's PaddlePaddle platform and the ```parl``` module, written for business needs
* [assets](./assets/): currently contains Chinese pseudocode for each RL algorithm

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-08-25 20:59:23
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from itertools import count
import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay
from common.launcher import Launcher
from envs.register import register_env
class PGNet(MLP):
''' Instead of outputting an action directly, the PG net outputs action probabilities, so we can inherit from the MLP class here
'''
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
class Main(Launcher):
def get_args(self):
""" Hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discount factor")
parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
parser.add_argument('--update_fre',default=8,type=int)
parser.add_argument('--hidden_dim',default=36,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg['seed'])
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n # action dimension
print(f"state dim: {n_states}, action dim: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
model = PGNet(n_states,1,hidden_dim=cfg['hidden_dim'])
memory = PGReplay()
agent = PolicyGradient(model,memory,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = []
for i_ep in range(cfg['train_eps']):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
agent.memory.push((state,float(action),reward))
state = next_state
if done:
print(f"Episode{i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
break
if (i_ep+1) % cfg['update_fre'] == 0:
agent.update()
rewards.append(ep_reward)
print('Finish training!')
env.close() # close environment
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = []
for i_ep in range(cfg['test_eps']):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print(f"Episode: {i_ep+1}/{cfg['test_eps']}Reward: {ep_reward:.2f}")
break
rewards.append(ep_reward)
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards}
if __name__ == "__main__":
main = Main()
main.run()
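In the new `main.py` above, `PGNet` ends in a single sigmoid unit, so the network emits one probability rather than an action; `agent.sample_action` (defined in `pg.py`, not shown here) has to turn that probability into a discrete CartPole action. A plausible Bernoulli-based sketch, not necessarily the repository's exact implementation:

```python
import torch
from torch.distributions import Bernoulli

def sample_action_sketch(policy_net, state, device='cpu'):
    """Illustrative only: map PGNet's single sigmoid output to a CartPole action (0 or 1)."""
    state = torch.tensor(state, dtype=torch.float32, device=device)
    prob = policy_net(state)      # probability of choosing action 1
    dist = Bernoulli(prob)        # two-action Bernoulli policy
    action = dist.sample()        # tensor holding 0.0 or 1.0
    return int(action.item())
```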

View File

@@ -1,16 +0,0 @@
{
"algo_name": "PolicyGradient",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.99,
"lr": 0.005,
"update_fre": 8,
"hidden_dim": 36,
"device": "cpu",
"seed": 1,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/models/",
"save_fig": true,
"show_fig": false
}

View File

@@ -0,0 +1 @@
{"algo_name": "PolicyGradient", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.99, "lr": 0.005, "update_fre": 8, "hidden_dim": 36, "device": "cpu", "seed": 1, "save_fig": true, "show_fig": false, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220825-205930/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220825-205930/models/", "n_states": 4, "n_actions": 2}

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2022-08-22 17:35:34
LastEditTime: 2022-08-25 20:58:59
Description:
Environment:
'''
@@ -19,12 +19,12 @@ import numpy as np
class PolicyGradient:
def __init__(self, n_states,model,memory,cfg):
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
def __init__(self, model,memory,cfg):
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.memory = memory
self.policy_net = model.to(self.device)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg['lr'])
def sample_action(self,state):
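This hunk only changes the constructor: `n_states` is dropped and the config is read as a dict (`cfg['gamma']`, `cfg['lr']`) instead of attribute access. The update step itself is not shown; below is a rough REINFORCE-style sketch of what `agent.update()` typically does with the `(state, action, reward)` tuples pushed in `main.py` (the `memory.sample()` / `memory.clear()` accessors are assumptions, not the file's actual API):

```python
import torch
from torch.distributions import Bernoulli

# Rough REINFORCE-style update; NOT necessarily pg.py's actual code.
def update_sketch(self):
    # memory.sample() is a hypothetical accessor returning the pushed (state, action, reward) tuples.
    state_pool, action_pool, reward_pool = zip(*self.memory.sample())
    # Discounted returns, computed backwards; main.py zeroes the reward at terminal steps,
    # so a zero reward restarts the running return for the next episode in the batch.
    returns, running = [], 0.0
    for r in reversed(reward_pool):
        running = 0.0 if r == 0 else r + self.gamma * running
        returns.insert(0, running)
    returns = torch.tensor(returns, dtype=torch.float32, device=self.device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize for stability

    self.optimizer.zero_grad()
    for state, action, ret in zip(state_pool, action_pool, returns):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        prob = self.policy_net(state).squeeze()                # sigmoid output of PGNet
        log_prob = Bernoulli(prob).log_prob(torch.tensor(action, device=self.device))
        (-log_prob * ret).backward()                           # policy-gradient loss
    self.optimizer.step()
    self.memory.clear()  # hypothetical: start collecting a fresh batch
```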

View File

@@ -1,139 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-08-22 17:40:07
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from itertools import count
import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay
def get_args():
""" Hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discount factor")
parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
parser.add_argument('--update_fre',default=8,type=int)
parser.add_argument('--hidden_dim',default=36,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
args = parser.parse_args([])
return args
class PGNet(MLP):
''' Instead of outputting an action directly, the PG net outputs action probabilities, so we can inherit from the MLP class here
'''
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
def env_agent_config(cfg):
env = gym.make(cfg.env_name)
if cfg.seed !=0: # set random seed
all_seed(env,seed=cfg.seed)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n # action dimension
print(f"state dim: {n_states}, action dim: {n_actions}")
model = PGNet(n_states,1,hidden_dim=cfg.hidden_dim)
memory = PGReplay()
agent = PolicyGradient(n_states,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print('Start training!')
print(f'Env:{cfg.env_name}, Algo:{cfg.algo_name}, Device:{cfg.device}')
rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
agent.memory.push((state,float(action),reward))
state = next_state
if done:
print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}')
break
if (i_ep+1) % cfg.update_fre == 0:
agent.update()
rewards.append(ep_reward)
print('Finish training!')
env.close() # close environment
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(cfg,env,agent):
print("start testing!")
print(f"Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}")
rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
break
rewards.append(ep_reward)
print("finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg.result_path) # save parameters
agent.save_model(path = cfg.model_path) # save models
save_results(res_dic, tag = 'train', path = cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test")

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-08-24 11:27:01
LastEditTime: 2022-08-25 14:59:15
Description:
Environment:
'''
@@ -18,136 +18,102 @@ sys.path.append(parent_path) # add path to system path
import gym
import datetime
import argparse
from envs.gridworld_env import CliffWalkingWapper,FrozenLakeWapper
from envs.gridworld_env import FrozenLakeWapper
from envs.wrappers import CliffWalkingWapper
from envs.register import register_env
from qlearning import QLearning
from common.utils import plot_rewards,save_args,all_seed
from common.utils import save_results,make_dir
def get_args():
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.90,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(cfg):
''' create env and agent
'''
if cfg['env_name'] == 'CliffWalking-v0':
env = gym.make(cfg['env_name'])
env = CliffWalkingWapper(env)
if cfg['env_name'] == 'FrozenLake-v1':
env = gym.make(cfg['env_name'],is_slippery=False)
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
n_states = env.observation_space.n # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
agent = QLearning(cfg)
return env,agent
def main(cfg,env,agent,tag = 'train'):
print(f"Start {tag}ing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward of the current episode
state = env.reset() # reset the environment, i.e. start a new episode
while True:
if tag == 'train': action = agent.sample_action(state) # sample an action with the algorithm
else: action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action) # take one step in the environment
if tag == 'train': agent.update(state, action, reward, next_state, done) # Q-learning update
state = next_state # update the state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['train_eps']}, Reward: {ep_reward:.1f}, Epsilon: {agent.epsilon}")
print(f"Finish {tag}ing!")
return {"rewards":rewards}
def train(cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
while True:
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.update(state, action, reward, next_state, done) # update agent
state = next_state # update state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}, Epislon: {agent.epsilon:.3f}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
from common.utils import all_seed
from common.launcher import Launcher
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.90,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['env_name'] == 'CliffWalking-v0':
env = CliffWalkingWapper(env)
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
n_states = env.observation_space.n # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
agent = QLearning(cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
while True:
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.update(state, action, reward, next_state, done) # update agent
state = next_state # update state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}, Epislon: {agent.epsilon:.3f}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg['result_path']) # save parameters
agent.save_model(path = cfg['model_path']) # save models
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
main = Main()
main.run()
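The `QLearning` agent driven by this launcher is imported from `qlearning.py` and is not part of this diff. For reference, the tabular update it is expected to perform each step is the standard rule Q(s,a) ← Q(s,a) + lr·(r + γ·max_a' Q(s',a')·(1−done) − Q(s,a)); a minimal stand-alone sketch (details of the repo's class may differ):

```python
import numpy as np
from collections import defaultdict

class TabularQ:
    """Minimal tabular Q-learning, shown only to illustrate the update the imported agent performs."""
    def __init__(self, n_actions, lr=0.1, gamma=0.9):
        self.Q = defaultdict(lambda: np.zeros(n_actions))
        self.lr, self.gamma = lr, gamma

    def update(self, state, action, reward, next_state, done):
        # TD target bootstraps from the greedy next-state value unless the episode ended.
        target = reward + self.gamma * np.max(self.Q[next_state]) * (1 - done)
        self.Q[state][action] += self.lr * (target - self.Q[state][action])
```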

Binary file not shown (image removed; 22 KiB)

Binary file not shown (image removed; 53 KiB)

View File

@@ -1,801 +0,0 @@
episodes,rewards,steps
0,0.0,20
1,0.0,14
2,0.0,13
3,0.0,9
4,0.0,10
5,0.0,6
6,0.0,11
7,0.0,6
8,0.0,3
9,0.0,9
10,0.0,11
11,0.0,22
12,0.0,5
13,0.0,16
14,0.0,4
15,0.0,9
16,0.0,18
17,0.0,2
18,0.0,4
19,0.0,8
20,0.0,7
21,0.0,4
22,0.0,22
23,0.0,15
24,0.0,5
25,0.0,16
26,0.0,7
27,0.0,19
28,0.0,22
29,0.0,16
30,0.0,11
31,0.0,22
32,0.0,28
33,0.0,23
34,0.0,4
35,0.0,11
36,0.0,8
37,0.0,15
38,0.0,5
39,0.0,7
40,0.0,9
41,0.0,4
42,0.0,3
43,0.0,6
44,0.0,41
45,0.0,9
46,0.0,23
47,0.0,3
48,1.0,38
49,0.0,29
50,0.0,17
51,0.0,4
52,0.0,2
53,0.0,25
54,0.0,6
55,0.0,2
56,0.0,30
57,0.0,6
58,0.0,7
59,0.0,11
60,0.0,9
61,0.0,8
62,0.0,23
63,0.0,10
64,0.0,3
65,0.0,5
66,0.0,7
67,0.0,18
68,0.0,8
69,0.0,26
70,0.0,6
71,0.0,14
72,0.0,4
73,0.0,25
74,0.0,21
75,0.0,13
76,0.0,4
77,0.0,29
78,0.0,21
79,0.0,6
80,0.0,6
81,0.0,11
82,0.0,21
83,0.0,9
84,0.0,9
85,0.0,7
86,0.0,48
87,0.0,23
88,0.0,100
89,0.0,60
90,0.0,7
91,0.0,10
92,0.0,24
93,0.0,4
94,0.0,7
95,0.0,17
96,0.0,87
97,0.0,28
98,0.0,7
99,0.0,5
100,0.0,12
101,0.0,14
102,0.0,6
103,0.0,13
104,0.0,93
105,0.0,4
106,0.0,50
107,0.0,8
108,0.0,12
109,0.0,43
110,0.0,30
111,0.0,15
112,0.0,19
113,0.0,100
114,0.0,82
115,0.0,40
116,0.0,88
117,0.0,19
118,0.0,30
119,0.0,27
120,0.0,5
121,0.0,87
122,0.0,9
123,0.0,64
124,0.0,27
125,0.0,68
126,0.0,81
127,0.0,86
128,0.0,100
129,0.0,100
130,0.0,27
131,0.0,41
132,0.0,70
133,0.0,27
134,0.0,6
135,0.0,18
136,0.0,38
137,0.0,26
138,0.0,36
139,0.0,3
140,0.0,61
141,0.0,100
142,0.0,4
143,0.0,39
144,0.0,18
145,0.0,33
146,0.0,29
147,0.0,49
148,0.0,88
149,0.0,22
150,0.0,65
151,0.0,36
152,0.0,30
153,0.0,58
154,0.0,43
155,0.0,53
156,0.0,43
157,0.0,13
158,0.0,8
159,0.0,39
160,0.0,29
161,0.0,26
162,0.0,60
163,0.0,100
164,0.0,31
165,0.0,22
166,0.0,100
167,0.0,46
168,0.0,23
169,0.0,54
170,0.0,8
171,0.0,58
172,0.0,3
173,0.0,47
174,0.0,16
175,0.0,21
176,0.0,44
177,0.0,29
178,0.0,100
179,0.0,100
180,0.0,62
181,0.0,83
182,0.0,26
183,0.0,24
184,0.0,10
185,0.0,12
186,0.0,40
187,0.0,25
188,0.0,18
189,0.0,60
190,0.0,100
191,0.0,100
192,0.0,24
193,0.0,56
194,0.0,71
195,0.0,19
196,0.0,100
197,0.0,44
198,0.0,41
199,0.0,41
200,0.0,60
201,0.0,31
202,0.0,34
203,0.0,35
204,0.0,59
205,0.0,51
206,0.0,100
207,0.0,100
208,0.0,100
209,0.0,100
210,0.0,37
211,0.0,68
212,0.0,40
213,0.0,17
214,0.0,79
215,0.0,100
216,0.0,26
217,0.0,61
218,0.0,25
219,0.0,18
220,0.0,27
221,0.0,13
222,0.0,100
223,0.0,87
224,0.0,100
225,0.0,92
226,0.0,100
227,0.0,8
228,0.0,100
229,0.0,64
230,0.0,17
231,0.0,82
232,0.0,100
233,0.0,94
234,0.0,7
235,0.0,36
236,0.0,100
237,0.0,56
238,0.0,17
239,0.0,100
240,0.0,83
241,0.0,100
242,0.0,100
243,0.0,43
244,0.0,87
245,0.0,42
246,0.0,80
247,0.0,54
248,0.0,82
249,0.0,97
250,0.0,65
251,0.0,83
252,0.0,100
253,0.0,59
254,0.0,100
255,0.0,78
256,0.0,100
257,0.0,100
258,0.0,43
259,0.0,80
260,0.0,100
261,0.0,70
262,0.0,94
263,0.0,100
264,0.0,100
265,0.0,37
266,0.0,11
267,0.0,31
268,0.0,100
269,0.0,34
270,0.0,32
271,0.0,58
272,0.0,38
273,0.0,28
274,0.0,100
275,0.0,59
276,0.0,100
277,0.0,82
278,0.0,51
279,0.0,25
280,0.0,73
281,0.0,56
282,0.0,55
283,0.0,38
284,0.0,100
285,0.0,100
286,0.0,92
287,0.0,100
288,0.0,100
289,0.0,100
290,0.0,37
291,0.0,100
292,0.0,66
293,0.0,24
294,0.0,17
295,0.0,100
296,0.0,59
297,0.0,25
298,0.0,73
299,0.0,100
300,0.0,29
301,0.0,100
302,0.0,72
303,0.0,6
304,1.0,57
305,0.0,47
306,0.0,48
307,0.0,13
308,0.0,100
309,0.0,38
310,0.0,100
311,0.0,20
312,0.0,100
313,0.0,100
314,0.0,5
315,0.0,39
316,0.0,11
317,0.0,83
318,0.0,42
319,0.0,100
320,0.0,99
321,0.0,83
322,0.0,28
323,0.0,46
324,0.0,100
325,0.0,100
326,0.0,62
327,0.0,100
328,0.0,23
329,0.0,91
330,0.0,53
331,0.0,19
332,0.0,26
333,0.0,93
334,0.0,38
335,0.0,22
336,0.0,43
337,0.0,100
338,0.0,90
339,0.0,18
340,0.0,45
341,0.0,65
342,1.0,22
343,0.0,100
344,1.0,15
345,1.0,72
346,0.0,5
347,1.0,6
348,1.0,6
349,1.0,9
350,1.0,8
351,1.0,9
352,1.0,8
353,1.0,6
354,1.0,6
355,1.0,10
356,1.0,6
357,0.0,5
358,0.0,3
359,1.0,6
360,1.0,6
361,1.0,6
362,1.0,6
363,1.0,8
364,1.0,6
365,1.0,8
366,1.0,6
367,1.0,6
368,1.0,8
369,1.0,6
370,1.0,6
371,0.0,5
372,1.0,6
373,0.0,6
374,1.0,6
375,1.0,12
376,1.0,6
377,1.0,6
378,1.0,9
379,1.0,6
380,1.0,6
381,0.0,2
382,0.0,3
383,0.0,2
384,0.0,4
385,0.0,3
386,1.0,7
387,1.0,6
388,1.0,6
389,1.0,8
390,1.0,9
391,1.0,8
392,1.0,8
393,1.0,6
394,1.0,6
395,1.0,7
396,1.0,6
397,0.0,5
398,0.0,5
399,1.0,10
400,1.0,6
401,0.0,3
402,1.0,6
403,1.0,7
404,1.0,6
405,1.0,6
406,1.0,6
407,1.0,6
408,1.0,6
409,1.0,6
410,1.0,6
411,0.0,5
412,1.0,6
413,1.0,6
414,0.0,2
415,1.0,6
416,1.0,6
417,1.0,6
418,1.0,6
419,1.0,6
420,1.0,8
421,1.0,6
422,1.0,6
423,1.0,6
424,1.0,6
425,1.0,7
426,0.0,5
427,1.0,6
428,1.0,6
429,1.0,6
430,1.0,8
431,1.0,6
432,1.0,6
433,1.0,6
434,1.0,6
435,0.0,2
436,1.0,8
437,1.0,7
438,1.0,6
439,1.0,7
440,1.0,6
441,1.0,6
442,0.0,3
443,0.0,4
444,1.0,6
445,1.0,6
446,1.0,7
447,1.0,6
448,1.0,6
449,1.0,6
450,1.0,6
451,1.0,6
452,1.0,6
453,1.0,8
454,1.0,6
455,1.0,6
456,1.0,6
457,1.0,6
458,1.0,6
459,1.0,7
460,1.0,8
461,1.0,6
462,1.0,7
463,1.0,6
464,1.0,6
465,1.0,6
466,1.0,6
467,1.0,8
468,1.0,6
469,1.0,6
470,1.0,8
471,1.0,6
472,1.0,11
473,1.0,6
474,1.0,6
475,1.0,6
476,1.0,8
477,0.0,2
478,1.0,7
479,1.0,6
480,1.0,6
481,1.0,7
482,1.0,6
483,1.0,6
484,1.0,6
485,1.0,6
486,0.0,3
487,1.0,7
488,1.0,6
489,1.0,6
490,1.0,6
491,0.0,3
492,1.0,6
493,1.0,7
494,1.0,12
495,1.0,6
496,0.0,9
497,1.0,6
498,1.0,6
499,0.0,8
500,1.0,6
501,0.0,3
502,0.0,5
503,0.0,3
504,1.0,6
505,1.0,6
506,1.0,6
507,1.0,6
508,1.0,6
509,1.0,6
510,1.0,6
511,1.0,6
512,1.0,6
513,1.0,6
514,0.0,2
515,1.0,7
516,1.0,6
517,1.0,6
518,1.0,6
519,1.0,6
520,1.0,6
521,1.0,7
522,0.0,4
523,1.0,6
524,0.0,5
525,1.0,6
526,1.0,6
527,1.0,6
528,1.0,6
529,0.0,3
530,1.0,6
531,1.0,6
532,1.0,6
533,1.0,7
534,1.0,8
535,1.0,6
536,1.0,6
537,1.0,6
538,1.0,6
539,1.0,7
540,1.0,7
541,1.0,7
542,1.0,8
543,1.0,6
544,1.0,10
545,1.0,6
546,1.0,6
547,1.0,6
548,1.0,8
549,1.0,6
550,1.0,6
551,1.0,8
552,1.0,6
553,1.0,7
554,1.0,6
555,1.0,7
556,1.0,6
557,1.0,6
558,1.0,7
559,1.0,7
560,1.0,7
561,1.0,6
562,1.0,6
563,1.0,6
564,1.0,6
565,1.0,6
566,1.0,6
567,1.0,6
568,1.0,7
569,0.0,4
570,1.0,8
571,1.0,8
572,1.0,7
573,1.0,6
574,1.0,8
575,1.0,6
576,1.0,6
577,1.0,7
578,1.0,6
579,1.0,6
580,1.0,8
581,1.0,7
582,1.0,6
583,1.0,6
584,0.0,3
585,1.0,11
586,1.0,6
587,1.0,8
588,0.0,2
589,1.0,6
590,1.0,6
591,1.0,6
592,1.0,6
593,1.0,8
594,1.0,6
595,1.0,7
596,1.0,6
597,1.0,7
598,1.0,6
599,1.0,8
600,0.0,2
601,1.0,6
602,1.0,7
603,1.0,6
604,1.0,6
605,1.0,10
606,1.0,7
607,1.0,6
608,1.0,6
609,1.0,6
610,1.0,6
611,1.0,6
612,1.0,7
613,0.0,4
614,1.0,7
615,1.0,6
616,1.0,8
617,0.0,3
618,1.0,6
619,1.0,6
620,1.0,6
621,1.0,6
622,0.0,2
623,1.0,6
624,1.0,6
625,1.0,6
626,1.0,6
627,1.0,6
628,1.0,7
629,1.0,6
630,1.0,6
631,1.0,7
632,1.0,6
633,1.0,6
634,1.0,6
635,1.0,6
636,1.0,6
637,1.0,6
638,1.0,6
639,1.0,8
640,1.0,6
641,1.0,8
642,1.0,7
643,1.0,6
644,0.0,3
645,1.0,6
646,1.0,7
647,1.0,6
648,1.0,6
649,1.0,6
650,1.0,10
651,1.0,6
652,1.0,6
653,1.0,6
654,1.0,6
655,1.0,10
656,1.0,6
657,1.0,8
658,1.0,8
659,1.0,7
660,1.0,6
661,0.0,5
662,0.0,2
663,1.0,8
664,1.0,6
665,1.0,10
666,1.0,6
667,1.0,8
668,1.0,10
669,1.0,6
670,1.0,6
671,1.0,6
672,1.0,10
673,1.0,6
674,0.0,4
675,1.0,6
676,1.0,6
677,1.0,6
678,1.0,15
679,1.0,6
680,1.0,6
681,1.0,6
682,1.0,6
683,1.0,6
684,1.0,6
685,1.0,8
686,1.0,6
687,1.0,7
688,1.0,6
689,1.0,6
690,1.0,8
691,1.0,6
692,1.0,6
693,1.0,8
694,1.0,8
695,1.0,6
696,1.0,6
697,1.0,6
698,1.0,10
699,1.0,6
700,1.0,6
701,1.0,6
702,1.0,6
703,1.0,6
704,1.0,6
705,1.0,6
706,1.0,8
707,1.0,8
708,1.0,6
709,1.0,6
710,0.0,2
711,1.0,6
712,1.0,6
713,1.0,6
714,1.0,8
715,1.0,6
716,1.0,6
717,1.0,6
718,1.0,6
719,1.0,6
720,1.0,6
721,1.0,6
722,1.0,6
723,1.0,6
724,1.0,7
725,0.0,3
726,1.0,7
727,1.0,6
728,1.0,6
729,1.0,6
730,0.0,2
731,1.0,6
732,1.0,8
733,1.0,6
734,1.0,6
735,1.0,6
736,1.0,6
737,1.0,9
738,1.0,6
739,1.0,6
740,1.0,6
741,1.0,6
742,1.0,6
743,1.0,6
744,1.0,9
745,1.0,7
746,0.0,4
747,1.0,6
748,1.0,8
749,1.0,11
750,1.0,6
751,1.0,6
752,1.0,6
753,1.0,6
754,1.0,6
755,1.0,8
756,1.0,6
757,1.0,6
758,1.0,8
759,1.0,7
760,1.0,6
761,1.0,8
762,1.0,6
763,0.0,5
764,1.0,9
765,1.0,8
766,1.0,8
767,1.0,6
768,1.0,8
769,1.0,8
770,1.0,6
771,0.0,5
772,0.0,3
773,0.0,2
774,1.0,8
775,1.0,6
776,1.0,6
777,1.0,6
778,1.0,6
779,1.0,6
780,1.0,6
781,1.0,6
782,1.0,6
783,1.0,6
784,1.0,6
785,1.0,6
786,1.0,6
787,1.0,6
788,1.0,6
789,0.0,2
790,1.0,6
791,0.0,4
792,1.0,6
793,1.0,6
794,1.0,6
795,1.0,6
796,1.0,6
797,1.0,8
798,0.0,5
799,1.0,6

View File

@@ -1,6 +1,6 @@
{
"algo_name": "Q-learning",
"env_name": "FrozenLake-v1",
"env_name": "FrozenLakeNoSlippery-v1",
"train_eps": 800,
"test_eps": 20,
"gamma": 0.9,
@@ -12,8 +12,8 @@
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/models/",
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/models/",
"n_states": 16,
"n_actions": 4
}

Binary file not shown (image added; 24 KiB)

Binary file not shown (image added; 55 KiB)

View File

@@ -0,0 +1,801 @@
episodes,rewards,steps
0,0.0,20
1,0.0,14
2,0.0,13
3,0.0,9
4,0.0,10
5,0.0,6
6,0.0,11
7,0.0,6
8,0.0,3
9,0.0,9
10,0.0,11
11,0.0,22
12,0.0,5
13,0.0,16
14,0.0,4
15,0.0,9
16,0.0,18
17,0.0,2
18,0.0,4
19,0.0,8
20,0.0,7
21,0.0,4
22,0.0,22
23,0.0,15
24,0.0,5
25,0.0,16
26,0.0,7
27,0.0,19
28,0.0,22
29,0.0,16
30,0.0,11
31,0.0,22
32,0.0,28
33,0.0,23
34,0.0,4
35,0.0,11
36,0.0,8
37,0.0,15
38,0.0,5
39,0.0,7
40,0.0,9
41,0.0,4
42,0.0,3
43,0.0,6
44,0.0,41
45,0.0,9
46,0.0,23
47,0.0,3
48,1.0,38
49,0.0,29
50,0.0,17
51,0.0,4
52,0.0,2
53,0.0,25
54,0.0,6
55,0.0,2
56,0.0,30
57,0.0,6
58,0.0,7
59,0.0,11
[results data continues: one (episode, reward, steps) row per episode through episode 799; rewards are mostly 0.0 for roughly the first 300 episodes and mostly 1.0 afterwards, while episode lengths drop to around 6-8 steps — full rows omitted]

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 17:59:16
LastEditor: John
LastEditTime: 2022-08-25 14:26:36
Description:
Environment:
'''
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import gym
import datetime
import argparse
from envs.register import register_env
from envs.wrappers import CliffWalkingWapper
from Sarsa.sarsa import Sarsa
from common.utils import save_results,make_dir,plot_rewards,save_args,all_seed
def get_args():
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='Racetrack-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--epsilon_start',default=0.90,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=200,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.2,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(cfg):
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed= cfg['seed'])
if cfg['env_name'] == 'CliffWalking-v0':
env = CliffWalkingWapper(env)
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update n_states and n_actions into cfg parameters
agent = Sarsa(cfg)
return env,agent
def train(cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
action = agent.sample_action(state)
while True:
# for _ in range(cfg.ep_max_steps):
next_state, reward, done, _ = env.step(action) # update env and return transitions
next_action = agent.sample_action(next_state)
agent.update(state, action, reward, next_state, next_action,done) # update agent
state = next_state # update state
action = next_action
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps: {ep_step}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
# for _ in range(cfg.ep_max_steps):
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action) # update env and return transitions
state = next_state
ep_reward+=reward
ep_step+=1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
cfg = get_args()
# train
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg['result_path'], cfg['model_path'])
save_args(cfg) # save parameters
agent.save_model(path=cfg['model_path']) # save model
save_results(res_dic, tag='train',
path=cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, tag="train")
# test
env, agent = env_agent_config(cfg)
agent.load_model(path=cfg['model_path']) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results
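
For reference, a minimal standalone sketch of the dict-merge pattern used in get_args above: vars(args) converts the argparse Namespace into a plain dict, which is then merged with the path defaults, which is why the rest of the script indexes cfg['key']. The argument and path below are illustrative only, not the real defaults.

import argparse

# minimal sketch of the config merge in get_args (illustrative values)
parser = argparse.ArgumentParser()
parser.add_argument('--lr', default=0.2, type=float)
args = parser.parse_args([])                               # empty list: parse no CLI args in this demo
default_args = {'result_path': './outputs/demo/results/'}  # hypothetical path
cfg = {**vars(args), **default_args}                       # Namespace -> dict, then merge the defaults in
print(cfg['lr'], cfg['result_path'])                       # downstream code indexes cfg like a plain dict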

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:58:16
LastEditor: John
LastEditTime: 2022-08-04 22:22:16
LastEditTime: 2022-08-25 00:23:22
Description:
Environment:
'''
@@ -14,45 +14,51 @@ from collections import defaultdict
import torch
import math
class Sarsa(object):
def __init__(self,
n_actions,cfg):
self.n_actions = n_actions
self.lr = cfg.lr
self.gamma = cfg.gamma
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.Q = defaultdict(lambda: np.zeros(n_actions)) # Q table
def sample(self, state):
def __init__(self,cfg):
self.n_actions = cfg['n_actions']
self.lr = cfg['lr']
self.gamma = cfg['gamma']
self.epsilon = cfg['epsilon_start']
self.sample_count = 0
self.epsilon_start = cfg['epsilon_start']
self.epsilon_end = cfg['epsilon_end']
self.epsilon_decay = cfg['epsilon_decay']
self.Q_table = defaultdict(lambda: np.zeros(self.n_actions)) # Q table
def sample_action(self, state):
''' another way to represent e-greedy policy
'''
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay) # the probability of selecting a random action decays exponentially with the sample count
best_action = np.argmax(self.Q[state])
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
return action
def predict(self,state):
return np.argmax(self.Q[state])
def update(self, state, action, reward, next_state, next_action,done):
Q_predict = self.Q[state][action]
if done:
Q_target = reward # terminal state
else:
Q_target = reward + self.gamma * self.Q[next_state][next_action] # unlike Q-learning, Sarsa bootstraps from the Q value of the next action actually taken
self.Q[state][action] += self.lr * (Q_target - Q_predict)
def save(self,path):
'''save the Q table to a file
def predict_action(self,state):
''' predict action while testing
'''
action = np.argmax(self.Q_table[state])
return action
def update(self, state, action, reward, next_state, next_action,done):
Q_predict = self.Q_table[state][action]
if done:
Q_target = reward # terminal state
else:
Q_target = reward + self.gamma * self.Q_table[next_state][next_action] # the only difference from Q learning
self.Q_table[state][action] += self.lr * (Q_target - Q_predict)
def save_model(self,path):
import dill
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(
obj=self.Q,
f=path+"sarsa_model.pkl",
obj=self.Q_table,
f=path+"checkpoint.pkl",
pickle_module=dill
)
def load(self, path):
'''load the Q table data from a file
'''
print("Model saved!")
def load_model(self, path):
import dill
self.Q =torch.load(f=path+'sarsa_model.pkl',pickle_module=dill)
self.Q_table = torch.load(f=path+'checkpoint.pkl',pickle_module=dill)
print("Mode loaded!")

View File

@@ -1,118 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 17:59:16
LastEditor: John
LastEditTime: 2022-08-04 22:28:51
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
from envs.racetrack_env import RacetrackEnv
from Sarsa.sarsa import Sarsa
from common.utils import save_results,make_dir,plot_rewards,save_args
def get_args():
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=300,type=int,help="episodes of training") # number of training episodes
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") # number of testing episodes
parser.add_argument('--ep_max_steps',default=200,type=int) # max steps per episode
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor") # discount factor
parser.add_argument('--epsilon_start',default=0.90,type=float,help="initial value of epsilon") # initial epsilon of the e-greedy policy
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") # final epsilon of the e-greedy policy
parser.add_argument('--epsilon_decay',default=200,type=int,help="decay rate of epsilon") # decay rate of epsilon in the e-greedy policy
parser.add_argument('--lr',default=0.2,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
n_actions = 9 # number of actions
agent = Sarsa(n_actions,cfg)
return env,agent
def train(cfg,env,agent):
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # record rewards
for i_ep in range(cfg.train_eps):
state = env.reset()
action = agent.sample(state)
ep_reward = 0
# while True:
for _ in range(cfg.ep_max_steps):
next_state, reward, done = env.step(action)
ep_reward+=reward
next_action = agent.sample(next_state)
agent.update(state, action, reward, next_state, next_action,done)
state = next_state
action = next_action
if done:
break
rewards.append(ep_reward)
if (i_ep+1)%2==0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}Epsilon{agent.epsilon}")
print('完成训练!')
return {"rewards":rewards}
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
# while True:
for _ in range(cfg.ep_max_steps):
action = agent.predict(state)
next_state, reward, done = env.step(action)
ep_reward+=reward
state = next_state
if done:
break
rewards.append(ep_reward)
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return {"rewards":rewards}
if __name__ == "__main__":
cfg = get_args()
# train
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# test
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results

View File

@@ -0,0 +1,32 @@
from common.utils import save_args,save_results,plot_rewards
class Launcher:
def __init__(self) -> None:
pass
def get_args(self):
cfg = {}
return cfg
def env_agent_config(self,cfg):
env,agent = None,None
return env,agent
def train(self,cfg, env, agent):
res_dic = {}
return res_dic
def test(self,cfg, env, agent):
res_dic = {}
return res_dic
def run(self):
cfg = self.get_args()
env, agent = self.env_agent_config(cfg)
res_dic = self.train(cfg, env, agent)
save_args(cfg,path = cfg['result_path']) # save parameters
agent.save_model(path = cfg['model_path']) # save models
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing
env, agent = self.env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model
res_dic = self.test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
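
A minimal sketch of how a concrete algorithm could plug into this Launcher; everything below is hypothetical except the four overridden methods and the 'result_path'/'model_path'/'rewards' keys that run() relies on.

# hypothetical subclass showing the contract run() expects
class DemoLauncher(Launcher):
    def get_args(self):
        # run() reads these two paths when saving args, models and results
        return {'result_path': './outputs/demo/results/', 'model_path': './outputs/demo/models/'}
    def env_agent_config(self, cfg):
        env, agent = None, None              # replace with a real env and agent
        return env, agent
    def train(self, cfg, env, agent):
        return {'rewards': [0.0], 'steps': [1]}   # 'rewards' is required by plot_rewards
    def test(self, cfg, env, agent):
        return {'rewards': [0.0], 'steps': [1]}

# DemoLauncher().run()   # would execute the train/save/test/plot pipeline above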

View File

@@ -72,84 +72,6 @@ class FrozenLakeWapper(gym.Wrapper):
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
# Env 1: FrozenLake, whether the ice surface is slippery is configurable

View File

@@ -1,10 +1,3 @@
# Please do not make changes to this file - it will be overwritten with a clean
# version when your work is marked.
#
# This file contains code for the racetrack environment that you will be using
# as part of the second part of the CM50270: Reinforcement Learning coursework.
import imp
import time
import random
import numpy as np
@@ -12,23 +5,20 @@ import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete
from gym.spaces import Discrete,Box
from matplotlib import colors
import gym
class RacetrackEnv(object) :
class RacetrackEnv(gym.Env) :
"""
Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111).
Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking.
The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have
included rather verbose comments here for those of you who are interested in how the environment has been
implemented (though this should not impact your solution code).
If you find any *bugs* with this code, please let me know immediately - thank you for finding them, sorry that I didn't!
However, please do not suggest optimisations - some things have been purposely simplified for readability's sake.
implemented (though this should not impact your solution code).
"""
ACTIONS_DICT = {
0 : (1, -1), # Acc Vert., Brake Horiz.
1 : (1, 0), # Acc Vert., Hold Horiz.
@@ -61,18 +51,15 @@ class RacetrackEnv(object) :
for x in range(self.track.shape[1]) :
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
self.initial_states.append((y, x))
high= np.array([np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max])
self.observation_space = Box(low=-high, high=high, shape=(4,), dtype=np.float32)
self.action_space = Discrete(9)
self.is_reset = False
#print("Racetrack Environment File Loaded Successfully.")
#print("Be sure to call .reset() before starting to initialise the environment and get an initial state!")
def step(self, action : int) :
"""
Takes a given action in the environment's current state, and returns a next state,
reward, and whether the next state is terminal or not.
reward, and whether the next state is done or not.
Arguments:
action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8].
@@ -86,7 +73,7 @@ class RacetrackEnv(object) :
A tuple of:\n
{(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n
{int} -- The reward earned by taking the given action in the current environment state.\n
{bool} -- Whether the environment's next state is terminal or not.\n
{bool} -- Whether the environment's next state is done or not.\n
"""
@@ -131,7 +118,7 @@ class RacetrackEnv(object) :
new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1])
reward = 0
terminal = False
done = False
# If position is out-of-bounds, return to start and set velocity components to zero.
if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) :
@@ -150,7 +137,7 @@ class RacetrackEnv(object) :
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") :
self.position = new_position
reward += 10
terminal = True
done = True
# If this gets reached, then the student has touched something they shouldn't have. Naughty!
else :
raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!")
@@ -158,12 +145,12 @@ class RacetrackEnv(object) :
# Penalise every timestep.
reward -= 1
# Require a reset if the current state is terminal.
if (terminal) :
# Require a reset if the current state is done.
if (done) :
self.is_reset = False
# Return next state, reward, and whether the episode has ended.
return (self.position[0], self.position[1], self.velocity[0], self.velocity[1]), reward, terminal
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]]), reward, done,{}
def reset(self) :
@@ -184,10 +171,10 @@ class RacetrackEnv(object) :
self.is_reset = True
return (self.position[0], self.position[1], self.velocity[0], self.velocity[1])
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]])
def render(self, sleep_time : float = 0.1) :
def render(self, mode = 'human') :
"""
Renders a pretty matplotlib plot representing the current state of the environment.
Calling this method on subsequent timesteps will update the plot.
@@ -230,13 +217,9 @@ class RacetrackEnv(object) :
# Draw everything.
#fig.canvas.draw()
#fig.canvas.flush_events()
plt.show()
# Sleep if desired.
if (sleep_time > 0) :
time.sleep(sleep_time)
# time sleep
time.sleep(0.1)
def get_actions(self) :
"""
@@ -244,18 +227,16 @@ class RacetrackEnv(object) :
of integers in the range [0-8].
"""
return [*self.ACTIONS_DICT]
if __name__ == "__main__":
num_steps = 1000000
env = RacetrackEnv()
state = env.reset()
print(state)
for _ in range(num_steps) :
# num_steps = 1000000
next_state, reward, done,_ = env.step(random.choice(env.get_actions()))
print(next_state)
env.render()
# env = RacetrackEnv()
# state = env.reset()
# print(state)
# for _ in range(num_steps) :
# next_state, reward, terminal = env.step(random.choice(env.get_actions()))
# print(next_state)
# env.render()
# if (terminal) :
# _ = env.reset()
if (done) :
_ = env.reset()

View File

@@ -0,0 +1,34 @@
from gym.envs.registration import register
def register_env(env_name):
if env_name == 'Racetrack-v0':
register(
id='Racetrack-v0',
entry_point='racetrack:RacetrackEnv',
max_episode_steps=1000,
kwargs={}
)
elif env_name == 'FrozenLakeNoSlippery-v1':
register(
id='FrozenLakeNoSlippery-v1',
entry_point='gym.envs.toy_text.frozen_lake:FrozenLakeEnv',
kwargs={'map_name':"4x4",'is_slippery':False},
)
else:
print("The env name must be wrong or the environment donot need to register!")
# if __name__ == "__main__":
# import random
# import gym
# env = gym.make('FrozenLakeNoSlippery-v1')
# num_steps = 1000000
# state = env.reset()
# n_actions = env.action_space.n
# print(state)
# for _ in range(num_steps) :
# next_state, reward, done,_ = env.step(random.choice(range(n_actions)))
# print(next_state)
# if (done) :
# _ = env.reset()
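
For context, a minimal sketch of how the custom id registered above would typically be consumed, assuming the racetrack module is importable exactly as the entry_point string says:

import gym
from envs.register import register_env   # same import path as in Sarsa/main.py above

register_env('Racetrack-v0')              # maps the id to racetrack:RacetrackEnv with max_episode_steps=1000
env = gym.make('Racetrack-v0')
state = env.reset()
next_state, reward, done, info = env.step(env.action_space.sample())  # gym-style 4-tuple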

View File

@@ -0,0 +1,78 @@
import gym
import turtle # required by CliffWalkingWapper.render() below
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
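
A small usage sketch of this wrapper, matching how env_agent_config applies it in Sarsa/main.py above; rendering is turtle-based, so it assumes a display is available.

import gym
from envs.wrappers import CliffWalkingWapper   # as imported in Sarsa/main.py above

env = CliffWalkingWapper(gym.make('CliffWalking-v0'))
state = env.reset()
env.render()                                   # draws the 12x4 grid and the agent with turtle
next_state, reward, done, info = env.step(env.action_space.sample())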

View File

@@ -11,4 +11,5 @@ else
fi
conda activate easyrl # easyrl can be replaced with the name of the conda env you created
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
python $codes_dir/QLearning/main.py --env_name FrozenLake-v1 --train_eps 800 --epsilon_start 0.70 --epsilon_end 0.1 --epsilon_decay 2000 --gamma 0.9 --lr 0.9 --device cpu
python $codes_dir/envs/register.py # register environment
python $codes_dir/QLearning/main.py --env_name FrozenLakeNoSlippery-v1 --train_eps 800 --epsilon_start 0.70 --epsilon_end 0.1 --epsilon_decay 2000 --gamma 0.9 --lr 0.9 --device cpu

View File

@@ -0,0 +1,13 @@
if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/anaconda3/etc/profile.d/conda.sh"
source ~/anaconda3/etc/profile.d/conda.sh
elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh"
source ~/opt/anaconda3/etc/profile.d/conda.sh
else
echo 'please manually config the conda source path'
fi
conda activate easyrl # easyrl can be replaced with the name of the conda env you created
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
python $codes_dir/envs/register.py # register environment
python $codes_dir/Sarsa/main.py