Merge branch 'master' of github.com:datawhalechina/easy-rl

qiwang067
2022-09-12 22:20:04 +08:00
150 changed files with 10271 additions and 3582 deletions

projects/.gitignore

@@ -2,4 +2,9 @@
.ipynb_checkpoints
__pycache__
.vscode
test.py
test.py
pseudocodes.aux
pseudocodes.log
pseudocodes.synctex.gz
pseudocodes.out
pseudocodes.toc


@@ -1,318 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义模型\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"import paddle.nn as nn\n",
"import paddle.nn.functional as F\n",
"import parl\n",
"\n",
"class CartpoleModel(parl.Model):\n",
" \"\"\" Linear network to solve Cartpole problem.\n",
" Args:\n",
" n_states (int): Dimension of observation space.\n",
" n_actions (int): Dimension of action space.\n",
" \"\"\"\n",
"\n",
" def __init__(self, n_states, n_actions):\n",
" super(CartpoleModel, self).__init__()\n",
" hid1_size = 128\n",
" hid2_size = 128\n",
" self.fc1 = nn.Linear(n_states, hid1_size)\n",
" self.fc2 = nn.Linear(hid1_size, hid2_size)\n",
" self.fc3 = nn.Linear(hid2_size, n_actions)\n",
"\n",
" def forward(self, obs):\n",
" h1 = F.relu(self.fc1(obs))\n",
" h2 = F.relu(self.fc2(h1))\n",
" Q = self.fc3(h2)\n",
" return Q"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import parl\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"\n",
"class CartpoleAgent(parl.Agent):\n",
" \"\"\"Agent of Cartpole env.\n",
" Args:\n",
" algorithm(parl.Algorithm): algorithm used to solve the problem.\n",
" \"\"\"\n",
"\n",
" def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):\n",
" super(CartpoleAgent, self).__init__(algorithm)\n",
" assert isinstance(n_actions, int)\n",
" self.n_actions = n_actions\n",
"\n",
" self.global_step = 0\n",
" self.update_target_steps = 200\n",
"\n",
" self.e_greed = e_greed\n",
" self.e_greed_decrement = e_greed_decrement\n",
"\n",
" def sample(self, obs):\n",
" \"\"\"Sample an action `for exploration` when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" sample = np.random.random()\n",
" if sample < self.e_greed:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" if np.random.random() < 0.01:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" act = self.predict(obs)\n",
" self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)\n",
" return act\n",
"\n",
" def predict(self, obs):\n",
" \"\"\"Predict an action when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" pred_q = self.alg.predict(obs)\n",
" act = pred_q.argmax().numpy()[0]\n",
" return act\n",
"\n",
" def learn(self, obs, act, reward, next_obs, terminal):\n",
" \"\"\"Update model with an episode data\n",
" Args:\n",
" obs(np.float32): shape of (batch_size, n_states)\n",
" act(np.int32): shape of (batch_size)\n",
" reward(np.float32): shape of (batch_size)\n",
" next_obs(np.float32): shape of (batch_size, n_states)\n",
" terminal(np.float32): shape of (batch_size)\n",
" Returns:\n",
" loss(float)\n",
" \"\"\"\n",
" if self.global_step % self.update_target_steps == 0:\n",
" self.alg.sync_target()\n",
" self.global_step += 1\n",
"\n",
" act = np.expand_dims(act, axis=-1)\n",
" reward = np.expand_dims(reward, axis=-1)\n",
" terminal = np.expand_dims(terminal, axis=-1)\n",
"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" act = paddle.to_tensor(act, dtype='int32')\n",
" reward = paddle.to_tensor(reward, dtype='float32')\n",
" next_obs = paddle.to_tensor(next_obs, dtype='float32')\n",
" terminal = paddle.to_tensor(terminal, dtype='float32')\n",
" loss = self.alg.learn(obs, act, reward, next_obs, terminal)\n",
" return loss.numpy()[0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import gym\n",
"import numpy as np\n",
"import parl\n",
"\n",
"from parl.utils import logger, ReplayMemory\n",
"from parl.algorithms import DQN\n",
"\n",
"LEARN_FREQ = 5 # training frequency\n",
"MEMORY_SIZE = 200000\n",
"MEMORY_WARMUP_SIZE = 200\n",
"BATCH_SIZE = 64\n",
"LEARNING_RATE = 0.0005\n",
"GAMMA = 0.99\n",
"\n",
"# train an episode\n",
"def run_train_episode(agent, env, rpm):\n",
" total_reward = 0\n",
" obs = env.reset()\n",
" step = 0\n",
" while True:\n",
" step += 1\n",
" action = agent.sample(obs)\n",
" next_obs, reward, done, _ = env.step(action)\n",
" rpm.append(obs, action, reward, next_obs, done)\n",
"\n",
" # train model\n",
" if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):\n",
" # s,a,r,s',done\n",
" (batch_obs, batch_action, batch_reward, batch_next_obs,\n",
" batch_done) = rpm.sample_batch(BATCH_SIZE)\n",
" train_loss = agent.learn(batch_obs, batch_action, batch_reward,\n",
" batch_next_obs, batch_done)\n",
"\n",
" total_reward += reward\n",
" obs = next_obs\n",
" if done:\n",
" break\n",
" return total_reward\n",
"\n",
"\n",
"# evaluate 5 episodes\n",
"def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):\n",
" eval_reward = []\n",
" for i in range(eval_episodes):\n",
" obs = env.reset()\n",
" episode_reward = 0\n",
" while True:\n",
" action = agent.predict(obs)\n",
" obs, reward, done, _ = env.step(action)\n",
" episode_reward += reward\n",
" if render:\n",
" env.render()\n",
" if done:\n",
" break\n",
" eval_reward.append(episode_reward)\n",
" return np.mean(eval_reward)\n",
"\n",
"\n",
"def main(args):\n",
" env = gym.make('CartPole-v0')\n",
" n_states = env.observation_space.shape[0]\n",
" n_actions = env.action_space.n\n",
" logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))\n",
"\n",
" # set action_shape = 0 while in discrete control environment\n",
" rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)\n",
"\n",
" # build an agent\n",
" model = CartpoleModel(n_states=n_states, n_actions=n_actions)\n",
" alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)\n",
" agent = CartpoleAgent(\n",
" alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)\n",
"\n",
" # warmup memory\n",
" while len(rpm) < MEMORY_WARMUP_SIZE:\n",
" run_train_episode(agent, env, rpm)\n",
"\n",
" max_episode = args.max_episode\n",
"\n",
" # start training\n",
" episode = 0\n",
" while episode < max_episode:\n",
" # train part\n",
" for i in range(50):\n",
" total_reward = run_train_episode(agent, env, rpm)\n",
" episode += 1\n",
"\n",
" # test part\n",
" eval_reward = run_evaluate_episodes(agent, env, render=False)\n",
" logger.info('episode:{} e_greed:{} Test reward:{}'.format(\n",
" episode, agent.e_greed, eval_reward))\n",
"\n",
" # save the parameters to ./model.ckpt\n",
" save_path = './model.ckpt'\n",
" agent.save(save_path)\n",
"\n",
" # save the model and parameters of policy network for inference\n",
" save_inference_path = './inference_model'\n",
" input_shapes = [[None, env.observation_space.shape[0]]]\n",
" input_dtypes = ['float32']\n",
" agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)\n",
"\n",
"\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:64]\u001b[0m obs_dim 4, act_dim 2\n",
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:92]\u001b[0m episode:50 e_greed:0.0988929999999989 Test reward:18.4\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:100 e_greed:0.09794799999999795 Test reward:9.6\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:150 e_greed:0.0973899999999974 Test reward:37.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:200 e_greed:0.09684299999999685 Test reward:8.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:250 e_greed:0.09635499999999636 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:300 e_greed:0.09585299999999586 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:350 e_greed:0.09535799999999536 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:400 e_greed:0.09486399999999487 Test reward:10.0\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:450 e_greed:0.09435299999999436 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:500 e_greed:0.09384899999999385 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:550 e_greed:0.09302299999999303 Test reward:69.0\n",
"\u001b[32m[08-01 21:48:25 MainThread @3996942455.py:92]\u001b[0m episode:600 e_greed:0.08774199999998775 Test reward:141.2\n",
"\u001b[32m[08-01 21:48:30 MainThread @3996942455.py:92]\u001b[0m episode:650 e_greed:0.0791019999999791 Test reward:184.0\n",
"\u001b[32m[08-01 21:48:35 MainThread @3996942455.py:92]\u001b[0m episode:700 e_greed:0.07011299999997012 Test reward:182.0\n",
"\u001b[32m[08-01 21:48:40 MainThread @3996942455.py:92]\u001b[0m episode:750 e_greed:0.06089099999996089 Test reward:197.4\n",
"\u001b[32m[08-01 21:48:45 MainThread @3996942455.py:92]\u001b[0m episode:800 e_greed:0.05139199999995139 Test reward:183.4\n",
"\u001b[32m[08-01 21:48:50 MainThread @3996942455.py:92]\u001b[0m episode:850 e_greed:0.042255999999942256 Test reward:153.0\n",
"\u001b[32m[08-01 21:48:55 MainThread @3996942455.py:92]\u001b[0m episode:900 e_greed:0.033495999999933496 Test reward:192.6\n",
"\u001b[32m[08-01 21:49:00 MainThread @3996942455.py:92]\u001b[0m episode:950 e_greed:0.024318999999924318 Test reward:166.6\n",
"\u001b[32m[08-01 21:49:06 MainThread @3996942455.py:92]\u001b[0m episode:1000 e_greed:0.014873999999916176 Test reward:187.0\n"
]
}
],
"source": [
"import argparse\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\n",
" '--max_episode',\n",
" type=int,\n",
" default=1000,\n",
" help='stop condition: number of max episode')\n",
"args = parser.parse_args(args=[])\n",
"\n",
"main(args)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.12 ('rl_tutorials')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4f613f1ab80ec98dc1b91d6e720de51301598a187317378e53e49b773c1123dd"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -1,11 +0,0 @@
[PARL](https://github.com/PaddlePaddle/PARL) is a high-performance, flexible reinforcement learning framework developed by Baidu AI Studio.
## Installation
1. Install parl by following the instructions on the [PARL Github](https://github.com/PaddlePaddle/PARL)
2. Install paddlepaddle: ```pip install paddlepaddle```
## FAQ
If you hit ```jupyter-client 7.3.1 requires pyzmq>=22.3, but you have pyzmq 18.1.1 which is incompatible.```, run:
```pip install -U pyzmq```
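A quick post-install sanity check (this snippet is an illustration added here, not part of the original README; it only assumes the usual `__version__` attributes exist on both packages):

```python
# Minimal check that the PARL + PaddlePaddle setup used in the notebook above imports cleanly.
# Illustrative only; adjust if your installed versions expose metadata differently.
import parl
import paddle

print("parl version:", parl.__version__)
print("paddle version:", paddle.__version__)
```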


@@ -11,7 +11,6 @@
The project mainly consists of the following parts:
* [Jupyter Notebook](./notebooks/): algorithms written as notebooks, with fairly detailed hands-on walkthroughs; recommended for beginners
* [codes](./codes/): algorithms written as Python scripts, in a style closer to real projects; recommended for readers with some coding experience (the architecture is described below)
* [parl](./PARL/): some RL examples based on the Baidu PaddlePaddle platform and the ```parl``` module, written for practical needs
* [Attachments](./assets/): currently contains Chinese pseudocode for each RL algorithm
@@ -23,15 +22,15 @@
Note: clicking a name jumps to the corresponding algorithm under [codes](./codes/); for other versions, please browse the repository yourself
| Algorithm | Reference | Notes |
| :-----------------------: | :----------------------------------------------------------: | :--: |
| | | |
| DQN-CNN | | To be added |
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | To be added |
| Algorithm | Reference | Notes |
| :-------------------------------------: | :----------------------------------------------------------: | :--: |
| [Policy Gradient](codes/PolicyGradient) | [Policy Gradient paper](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | |
| DQN-CNN | | To be added |
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | To be added |
## 3. Algorithm Environments


@@ -1,35 +0,0 @@
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand*\HyPL@Entry[1]{}
\HyPL@Entry{0<</S/D>>}
\@writefile{toc}{\contentsline {section}{\numberline {1}模版备用}{2}{section.1}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{2}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{3}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{4}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{5}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{6}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{7}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{8}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{9}{algorithm.}\protected@file@percent }
\gdef \@abspage@last{9}


@@ -1,570 +0,0 @@
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 23 AUG 2022 19:26
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
%&-line parsing enabled.
**/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes
(/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes.tex
LaTeX2e <2020-10-01> patch level 4
L3 programming layer <2021-02-18> (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexart.cls (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexbackend.cfg
File: ctexbackend.cfg 2021/03/14 v2.5.6 Backend configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3kernel/expl3.sty
Package: expl3 2021-02-18 L3 programming layer (loader)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
File: l3backend-xetex.def 2021-03-18 L3 backend support: XeTeX
(|extractbb --version)
\c__kernel_sys_dvipdfmx_version_int=\count175
\l__color_backend_stack_int=\count176
\g__color_backend_stack_int=\count177
\g__graphics_track_int=\count178
\l__pdf_internal_box=\box47
\g__pdf_backend_object_int=\count179
\g__pdf_backend_annotation_int=\count180
\g__pdf_backend_link_int=\count181
))
Document Class: ctexart 2021/03/14 v2.5.6 Chinese adapter for class article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-2020-10-01.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-generic.tex))) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty
Package: l3keys2e 2021-03-12 LaTeX2e option processing using LaTeX3 keys
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexhook.sty
Package: ctexhook 2021/03/14 v2.5.6 Document and package hooks (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexpatch.sty
Package: ctexpatch 2021/03/14 v2.5.6 Patching commands (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/fix-cm.sty
Package: fix-cm 2015/01/14 v1.1t fixes to LaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/ts1enc.def
File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
LaTeX Font Info: Redeclaring font encoding TS1 on input line 47.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel.sty
Package: everysel 2021/01/20 v2.1 EverySelectfont Package (MS)
(/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel-2011-10-28.sty))
\l__ctex_tmp_int=\count182
\l__ctex_tmp_box=\box48
\l__ctex_tmp_dim=\dimen138
\g__ctex_section_depth_int=\count183
\g__ctex_font_size_int=\count184
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexopts.cfg
File: ctexopts.cfg 2021/03/14 v2.5.6 Option configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/article.cls
Document Class: article 2020/04/10 v1.4m Standard LaTeX document class
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/size11.clo
File: size11.clo 2020/04/10 v1.4m Standard LaTeX file (size option)
)
\c@part=\count185
\c@section=\count186
\c@subsection=\count187
\c@subsubsection=\count188
\c@paragraph=\count189
\c@subparagraph=\count190
\c@figure=\count191
\c@table=\count192
\abovecaptionskip=\skip47
\belowcaptionskip=\skip48
\bibindent=\dimen139
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/engine/ctex-engine-xetex.def
File: ctex-engine-xetex.def 2021/03/14 v2.5.6 XeLaTeX adapter (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.sty
Package: xeCJK 2020/10/19 v3.8.6 Typesetting CJK scripts with XeLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xtemplate/xtemplate.sty
Package: xtemplate 2021-03-12 L3 Experimental prototype document functions
\l__xtemplate_tmp_dim=\dimen140
\l__xtemplate_tmp_int=\count193
\l__xtemplate_tmp_muskip=\muskip16
\l__xtemplate_tmp_skip=\skip49
)
\l__xeCJK_tmp_int=\count194
\l__xeCJK_tmp_box=\box49
\l__xeCJK_tmp_dim=\dimen141
\l__xeCJK_tmp_skip=\skip50
\g__xeCJK_space_factor_int=\count195
\l__xeCJK_begin_int=\count196
\l__xeCJK_end_int=\count197
\c__xeCJK_CJK_class_int=\XeTeXcharclass1
\c__xeCJK_FullLeft_class_int=\XeTeXcharclass2
\c__xeCJK_FullRight_class_int=\XeTeXcharclass3
\c__xeCJK_HalfLeft_class_int=\XeTeXcharclass4
\c__xeCJK_HalfRight_class_int=\XeTeXcharclass5
\c__xeCJK_NormalSpace_class_int=\XeTeXcharclass6
\c__xeCJK_CM_class_int=\XeTeXcharclass7
\c__xeCJK_HangulJamo_class_int=\XeTeXcharclass8
\l__xeCJK_last_skip=\skip51
\g__xeCJK_node_int=\count198
\c__xeCJK_CJK_node_dim=\dimen142
\c__xeCJK_CJK-space_node_dim=\dimen143
\c__xeCJK_default_node_dim=\dimen144
\c__xeCJK_default-space_node_dim=\dimen145
\c__xeCJK_CJK-widow_node_dim=\dimen146
\c__xeCJK_normalspace_node_dim=\dimen147
\l__xeCJK_ccglue_skip=\skip52
\l__xeCJK_ecglue_skip=\skip53
\l__xeCJK_punct_kern_skip=\skip54
\l__xeCJK_last_penalty_int=\count199
\l__xeCJK_last_bound_dim=\dimen148
\l__xeCJK_last_kern_dim=\dimen149
\l__xeCJK_widow_penalty_int=\count266
Package xtemplate Info: Declaring object type 'xeCJK/punctuation' taking 0
(xtemplate) argument(s) on line 2341.
\l__xeCJK_fixed_punct_width_dim=\dimen150
\l__xeCJK_mixed_punct_width_dim=\dimen151
\l__xeCJK_middle_punct_width_dim=\dimen152
\l__xeCJK_fixed_margin_width_dim=\dimen153
\l__xeCJK_mixed_margin_width_dim=\dimen154
\l__xeCJK_middle_margin_width_dim=\dimen155
\l__xeCJK_bound_punct_width_dim=\dimen156
\l__xeCJK_bound_margin_width_dim=\dimen157
\l__xeCJK_margin_minimum_dim=\dimen158
\l__xeCJK_kerning_total_width_dim=\dimen159
\l__xeCJK_same_align_margin_dim=\dimen160
\l__xeCJK_different_align_margin_dim=\dimen161
\l__xeCJK_kerning_margin_width_dim=\dimen162
\l__xeCJK_kerning_margin_minimum_dim=\dimen163
\l__xeCJK_bound_dim=\dimen164
\l__xeCJK_reverse_bound_dim=\dimen165
\l__xeCJK_margin_dim=\dimen166
\l__xeCJK_minimum_bound_dim=\dimen167
\l__xeCJK_kerning_margin_dim=\dimen168
\g__xeCJK_family_int=\count267
\l__xeCJK_fam_int=\count268
\g__xeCJK_fam_allocation_int=\count269
\l__xeCJK_verb_case_int=\count270
\l__xeCJK_verb_exspace_skip=\skip55
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.sty
Package: fontspec 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
Package: fontspec-xetex 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
\l__fontspec_script_int=\count271
\l__fontspec_language_int=\count272
\l__fontspec_strnum_int=\count273
\l__fontspec_tmp_int=\count274
\l__fontspec_tmpa_int=\count275
\l__fontspec_tmpb_int=\count276
\l__fontspec_tmpc_int=\count277
\l__fontspec_em_int=\count278
\l__fontspec_emdef_int=\count279
\l__fontspec_strong_int=\count280
\l__fontspec_strongdef_int=\count281
\l__fontspec_tmpa_dim=\dimen169
\l__fontspec_tmpb_dim=\dimen170
\l__fontspec_tmpc_dim=\dimen171
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/fontenc.sty
Package: fontenc 2020/08/10 v2.0s Standard LaTeX package
) (/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.cfg
File: xeCJK.cfg 2020/10/19 v3.8.6 Configuration file for xeCJK package
))
\ccwd=\dimen172
\l__ctex_ccglue_skip=\skip56
)
\l__ctex_ziju_dim=\dimen173
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber.sty
Package: zhnumber 2020/05/01 v2.8 Typesetting numbers with Chinese glyphs
\l__zhnum_scale_int=\count282
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber-utf8.cfg
File: zhnumber-utf8.cfg 2020/05/01 v2.8 Chinese numerals with UTF8 encoding
))
\l__ctex_heading_skip=\skip57
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/scheme/ctex-scheme-chinese-article.def
File: ctex-scheme-chinese-article.def 2021/03/14 v2.5.6 Chinese scheme for article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex-name-utf8.cfg
File: ctex-name-utf8.cfg 2021/03/14 v2.5.6 Caption with encoding UTF-8 (CTEX)
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-mac.def
File: ctex-fontset-mac.def 2021/03/14 v2.5.6 macOS fonts definition (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-macnew.def
File: ctex-fontset-macnew.def 2021/03/14 v2.5.6 macOS fonts definition for El Capitan or later version (CTEX)
Package fontspec Warning: Font "Songti SC Light" does not contain requested
(fontspec) Script "CJK".
Package fontspec Info: Font family 'SongtiSCLight(0)' created for font 'Songti
(fontspec) SC Light' with options
(fontspec) [Script={CJK},BoldItalicFont={Kaiti SC
(fontspec) Bold},BoldFont={Songti SC Bold},ItalicFont={Kaiti SC}].
(fontspec)
(fontspec) This font family consists of the following NFSS
(fontspec) series/shapes:
(fontspec)
(fontspec) - 'normal' (m/n) with NFSS spec.: <->"Songti SC
(fontspec) Light/OT:language=dflt;"
(fontspec) - 'small caps' (m/sc) with NFSS spec.:
(fontspec) - 'bold' (b/n) with NFSS spec.: <->"Songti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold small caps' (b/sc) with NFSS spec.:
(fontspec) - 'italic' (m/it) with NFSS spec.: <->"Kaiti
(fontspec) SC/OT:language=dflt;"
(fontspec) - 'italic small caps' (m/scit) with NFSS spec.:
(fontspec) - 'bold italic' (b/it) with NFSS spec.: <->"Kaiti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold italic small caps' (b/scit) with NFSS spec.:
))) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex.cfg
File: ctex.cfg 2021/03/14 v2.5.6 Configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithm.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
(/usr/local/texlive/2021/texmf-dist/tex/latex/float/float.sty
Package: float 2001/11/08 v1.3d Float enhancements (AL)
\c@float@type=\count283
\float@exts=\toks15
\float@box=\box50
\@float@everytoks=\toks16
\@floatcapt=\box51
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/ifthen.sty
Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
)
\@float@every@algorithm=\toks17
\c@algorithm=\count284
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithmic.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
(/usr/local/texlive/2021/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
\KV@toks@=\toks18
)
\c@ALC@unique=\count285
\c@ALC@line=\count286
\c@ALC@rem=\count287
\c@ALC@depth=\count288
\ALC@tlm=\skip58
\algorithmicindent=\skip59
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amssymb.sty
Package: amssymb 2013/01/14 v3.01 AMS font symbols
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amsfonts.sty
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
\@emptytoks=\toks19
\symAMSa=\mathgroup4
\symAMSb=\mathgroup5
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2020/09/23 v2.17i AMS math features
\@mathmargin=\skip60
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2000/06/29 v2.01 AMS text
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks20
\ex@=\dimen174
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen175
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2016/03/08 v2.02 operator names
)
\inf@bad=\count289
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count290
\leftroot@=\count291
LaTeX Info: Redefining \overline on input line 399.
\classnum@=\count292
\DOTSCASE@=\count293
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box52
\strutbox@=\box53
\big@size=\dimen176
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
\macc@depth=\count294
\c@MaxMatrixCols=\count295
\dotsspace@=\muskip17
\c@parentequation=\count296
\dspbrk@lvl=\count297
\tag@help=\toks21
\row@=\count298
\column@=\count299
\maxfields@=\count300
\andhelp@=\toks22
\eqnshift@=\dimen177
\alignsep@=\dimen178
\tagshift@=\dimen179
\tagwidth@=\dimen180
\totwidth@=\dimen181
\lineht@=\dimen182
\@envbody=\toks23
\multlinegap=\skip61
\multlinetaggap=\skip62
\mathdisplay@stack=\toks24
LaTeX Info: Redefining \[ on input line 2923.
LaTeX Info: Redefining \] on input line 2924.
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref.sty
Package: hyperref 2021-02-27 v7.00k Hypertext links for LaTeX
(/usr/local/texlive/2021/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/iftex/iftex.sty
Package: iftex 2020/03/06 v1.0d TeX engine tests
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO)
(/usr/local/texlive/2021/texmf-dist/tex/generic/infwarerr/infwarerr.sty
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
)
Package pdftexcmds Info: \pdf@primitive is available.
Package pdftexcmds Info: \pdf@ifprimitive is available.
Package pdftexcmds Info: \pdfdraftmode not found.
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvsetkeys/kvsetkeys.sty
Package: kvsetkeys 2019/12/15 v1.18 Key value parser (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdfescape/pdfescape.sty
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hycolor/hycolor.sty
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/auxhook/auxhook.sty
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/kvoptions/kvoptions.sty
Package: kvoptions 2020-10-07 v3.14 Key value format for package options (HO)
)
\@linkdim=\dimen183
\Hy@linkcounter=\count301
\Hy@pagecounter=\count302
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/pd1enc.def
File: pd1enc.def 2021-02-27 v7.00k Hyperref: PDFDocEncoding definition (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref-langpatches.def
File: hyperref-langpatches.def 2021-02-27 v7.00k Hyperref: patches for babel languages
) (/usr/local/texlive/2021/texmf-dist/tex/generic/intcalc/intcalc.sty
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/etexcmds/etexcmds.sty
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
)
\Hy@SavedSpaceFactor=\count303
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/puenc.def
File: puenc.def 2021-02-27 v7.00k Hyperref: PDF Unicode definition (HO)
)
Package hyperref Info: Option `unicode' set `true' on input line 4073.
Package hyperref Info: Hyper figures OFF on input line 4192.
Package hyperref Info: Link nesting OFF on input line 4197.
Package hyperref Info: Hyper index ON on input line 4200.
Package hyperref Info: Plain pages OFF on input line 4207.
Package hyperref Info: Backreferencing OFF on input line 4212.
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
Package hyperref Info: Bookmarks ON on input line 4445.
\c@Hy@tempcnt=\count304
(/usr/local/texlive/2021/texmf-dist/tex/latex/url/url.sty
\Urlmuskip=\muskip18
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
)
LaTeX Info: Redefining \url on input line 4804.
\XeTeXLinkMargin=\dimen184
(/usr/local/texlive/2021/texmf-dist/tex/generic/bitset/bitset.sty
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
(/usr/local/texlive/2021/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO)
))
\Fld@menulength=\count305
\Field@Width=\dimen185
\Fld@charsize=\dimen186
Package hyperref Info: Hyper figures OFF on input line 6075.
Package hyperref Info: Link nesting OFF on input line 6080.
Package hyperref Info: Hyper index ON on input line 6083.
Package hyperref Info: backreferencing OFF on input line 6090.
Package hyperref Info: Link coloring OFF on input line 6095.
Package hyperref Info: Link coloring with OCG OFF on input line 6100.
Package hyperref Info: PDF/A mode OFF on input line 6105.
LaTeX Info: Redefining \ref on input line 6145.
LaTeX Info: Redefining \pageref on input line 6149.
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atbegshi-ltx.sty
Package: atbegshi-ltx 2020/08/17 v1.0a Emulation of the original atbegshi package
with kernel methods
)
\Hy@abspage=\count306
\c@Item=\count307
\c@Hfootnote=\count308
)
Package hyperref Info: Driver (autodetected): hxetex.
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hxetex.def
File: hxetex.def 2021-02-27 v7.00k Hyperref driver for XeTeX
(/usr/local/texlive/2021/texmf-dist/tex/generic/stringenc/stringenc.sty
Package: stringenc 2019/11/29 v1.12 Convert strings between diff. encodings (HO)
)
\pdfm@box=\box54
\c@Hy@AnnotLevel=\count309
\HyField@AnnotCount=\count310
\Fld@listcount=\count311
\c@bookmark@seq@number=\count312
(/usr/local/texlive/2021/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
Package: rerunfilecheck 2019/12/05 v1.9 Rerun checks for auxiliary files (HO)
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atveryend-ltx.sty
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atvery package
with kernel methods
) (/usr/local/texlive/2021/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
)
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 286.
)
\Hy@SectionHShift=\skip63
) (/usr/local/texlive/2021/texmf-dist/tex/latex/setspace/setspace.sty
Package: setspace 2011/12/19 v6.7a set line spacing
) (/usr/local/texlive/2021/texmf-dist/tex/latex/titlesec/titlesec.sty
Package: titlesec 2019/10/16 v2.13 Sectioning titles
\ttl@box=\box55
\beforetitleunit=\skip64
\aftertitleunit=\skip65
\ttl@plus=\dimen187
\ttl@minus=\dimen188
\ttl@toksa=\toks25
\titlewidth=\dimen189
\titlewidthlast=\dimen190
\titlewidthfirst=\dimen191
) (./pseudocodes.aux)
\openout1 = `pseudocodes.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
ABD: EverySelectfont initializing macros
LaTeX Info: Redefining \selectfont on input line 14.
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
(fontspec) this).
\symlegacymaths=\mathgroup6
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 14.
LaTeX Font Info: Redeclaring math accent \acute on input line 14.
LaTeX Font Info: Redeclaring math accent \grave on input line 14.
LaTeX Font Info: Redeclaring math accent \ddot on input line 14.
LaTeX Font Info: Redeclaring math accent \tilde on input line 14.
LaTeX Font Info: Redeclaring math accent \bar on input line 14.
LaTeX Font Info: Redeclaring math accent \breve on input line 14.
LaTeX Font Info: Redeclaring math accent \check on input line 14.
LaTeX Font Info: Redeclaring math accent \hat on input line 14.
LaTeX Font Info: Redeclaring math accent \dot on input line 14.
LaTeX Font Info: Redeclaring math accent \mathring on input line 14.
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Delta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Theta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 14.
LaTeX Font Info: Redeclaring math symbol \Xi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Pi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 14.
LaTeX Font Info: Redeclaring math symbol \Phi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Psi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Omega on input line 14.
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 14.
LaTeX Font Info: Redeclaring symbol font `operators' on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `normal' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `bold' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 14.
Package hyperref Info: Link coloring OFF on input line 14.
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2021-04-02 v2.47 Cross-referencing by name of section
(/usr/local/texlive/2021/texmf-dist/tex/latex/refcount/refcount.sty
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
)
\c@section@level=\count313
)
LaTeX Info: Redefining \ref on input line 14.
LaTeX Info: Redefining \pageref on input line 14.
LaTeX Info: Redefining \nameref on input line 14.
(./pseudocodes.out) (./pseudocodes.out)
\@outlinefile=\write3
\openout3 = `pseudocodes.out'.
(./pseudocodes.toc)
\tf@toc=\write4
\openout4 = `pseudocodes.toc'.
LaTeX Font Info: Font shape `TU/SongtiSCLight(0)/m/sl' in size <10.95> not available
(Font) Font shape `TU/SongtiSCLight(0)/m/it' tried instead on input line 17.
[1
]
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 22.
[2
]
LaTeX Font Info: Trying to load font information for U+msa on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info: Trying to load font information for U+msb on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
) [3
] [4
] [5
] [6
] [7
] [8
]
Overfull \hbox (32.54117pt too wide) in paragraph at lines 212--212
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^R\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 Q[] [] []$|
[]
Overfull \hbox (15.41673pt too wide) in paragraph at lines 213--213
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^^\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 ^^K [] [] \OT1/cmr/m/n/9 + [] \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 f[] []$\TU/lmr/m/n/9 ,$[][] \OT1/cmr/m/n/9 =
[]
[9
] (./pseudocodes.aux)
Package rerunfilecheck Info: File `pseudocodes.out' has not changed.
(rerunfilecheck) Checksum: 35B5A79A86EF3BC70F1A0B3BCBEBAA13;724.
)
Here is how much of TeX's memory you used:
14827 strings out of 476919
313456 string characters out of 5821840
653576 words of memory out of 5000000
34576 multiletter control sequences out of 15000+600000
413609 words of font info for 91 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
101i,13n,104p,676b,697s stack positions out of 5000i,500n,10000p,200000b,80000s
Output written on pseudocodes.pdf (9 pages).


@@ -1,8 +0,0 @@
\BOOKMARK [1][-]{section.1}{\376\377\152\041\162\110\131\007\165\050}{}% 1
\BOOKMARK [1][-]{section.2}{\376\377\000Q\000\040\000l\000e\000a\000r\000n\000i\000n\000g\173\227\154\325}{}% 2
\BOOKMARK [1][-]{section.3}{\376\377\000S\000a\000r\000s\000a\173\227\154\325}{}% 3
\BOOKMARK [1][-]{section.4}{\376\377\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\173\227\154\325}{}% 4
\BOOKMARK [1][-]{section.5}{\376\377\000D\000Q\000N\173\227\154\325}{}% 5
\BOOKMARK [1][-]{section.6}{\376\377\000S\000o\000f\000t\000Q\173\227\154\325}{}% 6
\BOOKMARK [1][-]{section.7}{\376\377\000S\000A\000C\000-\000S\173\227\154\325}{}% 7
\BOOKMARK [1][-]{section.8}{\376\377\000S\000A\000C\173\227\154\325}{}% 8


@@ -11,6 +11,27 @@
\usepackage{float} % 调用该包能够使用[H]
% \pagestyle{plain} % 去除页眉但是保留页脚编号都去掉plain换empty
% 更改脚注为圆圈
\usepackage{pifont}
\makeatletter
\newcommand*{\circnum}[1]{%
\expandafter\@circnum\csname c@#1\endcsname
}
\newcommand*{\@circnum}[1]{%
\ifnum#1<1 %
\@ctrerr
\else
\ifnum#1>20 %
\@ctrerr
\else
\ding{\the\numexpr 171+(#1)\relax}%
\fi
\fi
}
\makeatother
\renewcommand*{\thefootnote}{\circnum{footnote}}
\begin{document}
\tableofcontents % 目录注意要运行两下或者vscode保存两下才能显示
% \singlespacing
@@ -69,27 +90,10 @@
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage
\section{Policy Gradient算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{REINFORCE算法Monte-Carlo Policy Gradient}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\begin{algorithmic}[1] % [1]显示步数
\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
\FOR {回合数 = $1,M$}
\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
\FOR {时步 = $1,t$}
\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage
\section{DQN算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{DQN算法}{\hypersetup{linkcolor=white}\footnotemark}}
\floatname{algorithm}{{DQN算法}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{输入:}}
@@ -109,10 +113,10 @@
\STATE 更新环境状态$s_{t+1} \leftarrow s_t$
\STATE {\bfseries 更新策略:}
\STATE$D$中采样一个batch的transition
\STATE 计算实际的$Q$值,即$y_{j}${\hypersetup{linkcolor=white}\footnotemark}
\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降{\hypersetup{linkcolor=white}\footnotemark}
\STATE 计算实际的$Q$值,即$y_{j}$\footnotemark[2]
\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降\footnotemark[3]
\ENDFOR
\STATE$C$个回合复制参数$\hat{Q}\leftarrow Q${\hypersetup{linkcolor=white}\footnotemark}
\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
\ENDFOR
\end{algorithmic}
\end{algorithm}
@@ -121,7 +125,46 @@
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步,但没有每$C$个回合稳定}
\clearpage
\section{Policy Gradient算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{REINFORCE算法Monte-Carlo Policy Gradient}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\begin{algorithmic}[1] % [1]显示步数
\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
\FOR {回合数 = $1,M$}
\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
\FOR {时步 = $1,t$}
\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage
\section{Advantage Actor Critic算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{Q Actor Critic算法}}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\begin{algorithmic}[1] % [1]显示步数
\STATE 初始化Actor参数$\theta$和Critic参数$w$
\FOR {回合数 = $1,M$}
\STATE 根据策略$\pi_{\theta}(a|s)$采样一个(或几个)回合的transition
\STATE {\bfseries 更新Critic参数\footnotemark[1]}
\FOR {时步 = $t+1,1$}
\STATE 计算Advantage$ \delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1})-Q_w(s_t,a_t)$
\STATE $w \leftarrow w+\alpha_{w} \delta_{t} \nabla_{w} Q_w(s_t,a_t)$
\STATE $a_t \leftarrow a_{t+1}$,$s_t \leftarrow s_{t+1}$
\ENDFOR
\STATE 更新Actor参数$\theta \leftarrow \theta+\alpha_{\theta} Q_{w}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)$
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{这里结合TD error的特性按照从$t+1$$1$计算法Advantage更方便}
\clearpage
\section{SoftQ算法}
\begin{algorithm}[H]
\floatname{algorithm}{{SoftQ算法}}
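The Q Actor-Critic pseudocode added in the hunk above pairs a TD-error critic step with a log-probability actor step. Below is a minimal PyTorch sketch of that inner update loop; it is an illustration written for this note rather than code from the repo, the `Actor`/`Critic` modules and the `q_actor_critic_update` helper are assumed names, and terminal-state masking is omitted for brevity:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    """Softmax policy pi_theta(a|s) over discrete actions (illustrative)."""
    def __init__(self, n_states, n_actions, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_states, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, n_actions))

    def forward(self, state):
        return F.softmax(self.net(state), dim=-1)

class Critic(nn.Module):
    """Action-value critic Q_w(s, .) for discrete actions (illustrative)."""
    def __init__(self, n_states, n_actions, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_states, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, n_actions))

    def forward(self, state):
        return self.net(state)

def q_actor_critic_update(actor, critic, actor_optim, critic_optim,
                          transitions, gamma=0.99):
    """One episode of updates following the pseudocode:
    delta_t = r_t + gamma * Q_w(s_{t+1}, a_{t+1}) - Q_w(s_t, a_t);
    the critic descends the squared TD error, the actor ascends
    Q_w(s_t, a_t) * log pi_theta(a_t | s_t)."""
    for s, a, r, s_next, a_next in reversed(transitions):
        s = torch.as_tensor(s, dtype=torch.float32)
        s_next = torch.as_tensor(s_next, dtype=torch.float32)

        # Critic step: squared TD error, with the bootstrap target held fixed.
        q_sa = critic(s)[a]
        with torch.no_grad():
            td_target = r + gamma * critic(s_next)[a_next]
        critic_loss = F.mse_loss(q_sa, td_target)
        critic_optim.zero_grad()
        critic_loss.backward()
        critic_optim.step()

        # Actor step: policy-gradient update weighted by the (detached) Q value.
        log_prob = torch.log(actor(s)[a])
        actor_loss = -critic(s)[a].detach() * log_prob
        actor_optim.zero_grad()
        actor_loss.backward()
        actor_optim.step()
```

The repo's A2C implementation in `a2c.py` below follows the Advantage variant instead: it replaces the per-step Q target with bootstrapped returns and a learned state-value baseline.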


@@ -1,8 +0,0 @@
\contentsline {section}{\numberline {1}模版备用}{2}{section.1}%
\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}%
\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}%
\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}%
\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}%
\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}%
\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}%
\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}%


@@ -1,56 +1,60 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-05-03 22:16:08
LastEditor: JiangJi
LastEditTime: 2022-07-20 23:54:40
Discription:
Environment:
'''
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
class ActorCritic(nn.Module):
''' A2C network model, containing an actor and a critic
'''
def __init__(self, input_dim, output_dim, hidden_dim):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)
def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value
class A2C:
''' A2C algorithm
'''
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def __init__(self,models,memories,cfg):
self.n_actions = cfg['n_actions']
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.memory = memories['ACMemory']
self.actor = models['Actor'].to(self.device)
self.critic = models['Critic'].to(self.device)
self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])
def sample_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
def predict_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
def update(self,next_state,entropy):
value_pool,log_prob_pool,reward_pool = self.memory.sample()
next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
next_value = self.critic(next_state)
returns = np.zeros_like(reward_pool)
for t in reversed(range(len(reward_pool))):
next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
returns[t] = next_value
returns = torch.tensor(returns, device=self.device)
value_pool = torch.tensor(value_pool, device=self.device)
advantages = returns - value_pool
log_prob_pool = torch.stack(log_prob_pool)
actor_loss = (-log_prob_pool * advantages).mean()
critic_loss = 0.5 * advantages.pow(2).mean()
tot_loss = actor_loss + critic_loss + 0.001 * entropy
self.actor_optim.zero_grad()
self.critic_optim.zero_grad()
tot_loss.backward()
self.actor_optim.step()
self.critic_optim.step()
self.memory.clear()
def save_model(self, path):
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
torch.save(self.critic.state_dict(), f"{path}/critic_checkpoint.pt")
def compute_returns(self,next_value, rewards, masks):
R = next_value
returns = []
for step in reversed(range(len(rewards))):
R = rewards[step] + self.gamma * R * masks[step]
returns.insert(0, R)
return returns
def load_model(self, path):
self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
self.critic.load_state_dict(torch.load(f"{path}/critic_checkpoint.pt"))
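For reference, the backward sweep inside `update()` above computes the targets as G_t = r_t + gamma * G_{t+1}, seeded with the critic's value of the final `next_state`. A tiny standalone version of that recursion with made-up numbers (illustrative only, not part of the repo):

```python
import numpy as np

def bootstrap_returns(reward_pool, next_value, gamma=0.99):
    """Backward sweep: returns[t] = r_t + gamma * returns[t+1],
    started from the critic's estimate for the state after the last reward."""
    returns = np.zeros(len(reward_pool), dtype=np.float32)
    running = float(next_value)
    for t in reversed(range(len(reward_pool))):
        running = reward_pool[t] + gamma * running
        returns[t] = running
    return returns

# Made-up rewards and bootstrap value: G_2 = 3 + 0.9*0.5, G_1 = 2 + 0.9*G_2, G_0 = 1 + 0.9*G_1.
print(bootstrap_returns([1.0, 2.0, 3.0], next_value=0.5, gamma=0.9))
# roughly [5.59, 5.11, 3.45]
```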


@@ -0,0 +1,55 @@
import torch
import numpy as np
class A2C_2:
def __init__(self,models,memories,cfg):
self.n_actions = cfg['n_actions']
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.memory = memories['ACMemory']
self.ac_net = models['ActorCritic'].to(self.device)
self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
def sample_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
value, dist = self.ac_net(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
def predict_action(self,state):
''' predict can be all wrapped with no_grad(), then donot need detach(), or you can just copy contents of 'sample_action'
'''
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
value, dist = self.ac_net(state)
value = value.numpy().squeeze(0)[0] # shape(value) = (1,)
action = np.random.choice(self.n_actions, p=dist.numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
def update(self,next_state,entropy):
value_pool,log_prob_pool,reward_pool = self.memory.sample()
next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
next_value,_ = self.ac_net(next_state)
returns = np.zeros_like(reward_pool)
for t in reversed(range(len(reward_pool))):
next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
returns[t] = next_value
returns = torch.tensor(returns, device=self.device)
value_pool = torch.tensor(value_pool, device=self.device)
advantages = returns - value_pool
log_prob_pool = torch.stack(log_prob_pool)
actor_loss = (-log_prob_pool * advantages).mean()
critic_loss = 0.5 * advantages.pow(2).mean()
ac_loss = actor_loss + critic_loss + 0.001 * entropy
self.ac_optimizer.zero_grad()
ac_loss.backward()
self.ac_optimizer.step()
self.memory.clear()
def save_model(self, path):
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.ac_net.state_dict(), f"{path}/a2c_checkpoint.pt")
def load_model(self, path):
self.ac_net.load_state_dict(torch.load(f"{path}/a2c_checkpoint.pt"))

projects/codes/A2C/main.py

@@ -0,0 +1,121 @@
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
import gym
import torch
import numpy as np
from common.utils import all_seed
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorSoftmax,Critic
from envs.register import register_env
from a2c import A2C
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=1600,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--actor_lr',default=3e-4,type=float,help="learning rate of actor")
parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
parser.add_argument('--actor_hidden_dim',default=256,type=int,help="hidden of actor net")
parser.add_argument('--critic_hidden_dim',default=256,type=int,help="hidden of critic net")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update cfg parameters
models = {'Actor':ActorSoftmax(cfg['n_states'],cfg['n_actions'], hidden_dim = cfg['actor_hidden_dim']),'Critic':Critic(cfg['n_states'],1,hidden_dim=cfg['critic_hidden_dim'])}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action, value, dist = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(dist.detach().numpy() * np.log(dist.detach().numpy())) # policy entropy: -sum(p * log p)
agent.memory.push((value,log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
agent.update(next_state,ep_entropy) # update agent
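# Note: agent.update (defined in a2c.py, not shown in this diff) is assumed to compute discounted
# returns from the stored (value, log_prob, reward) tuples, form advantages A = R - V(s), and apply
# the usual A2C actor/critic losses with an entropy bonus.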
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action,_,_ = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()

projects/codes/A2C/main2.py (new file, 120 lines)

@@ -0,0 +1,120 @@
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
import gym
import torch
import numpy as np
from common.utils import all_seed
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorCriticSoftmax
from envs.register import register_env
from a2c_2 import A2C_2
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=2000,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discount factor")
parser.add_argument('--lr',default=3e-4,type=float,help="learning rate")
parser.add_argument('--actor_hidden_dim',default=256,type=int)
parser.add_argument('--critic_hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg parameters
models = {'ActorCritic':ActorCriticSoftmax(cfg['n_states'],cfg['n_actions'], actor_hidden_dim = cfg['actor_hidden_dim'],critic_hidden_dim=cfg['critic_hidden_dim'])}
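# Unlike main.py above, main2.py bundles both heads into a single ActorCriticSoftmax module
# (defined in common/models.py, not shown in this diff), so the agent in a2c_2.py presumably
# trains actor and critic with one shared optimizer.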
memories = {'ACMemory':PGReplay()}
agent = A2C_2(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action, value, dist = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(dist.detach().numpy() * np.log(dist.detach().numpy())) # policy entropy: -sum(p * log p)
agent.memory.push((value,log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
agent.update(next_state,ep_entropy) # update agent
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action,_,_ = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()


@@ -0,0 +1,19 @@
{
"algo_name": "A2C",
"env_name": "CartPole-v0",
"train_eps": 2000,
"test_eps": 20,
"ep_max_steps": 100000,
"gamma": 0.99,
"lr": 0.0003,
"actor_hidden_dim": 256,
"critic_hidden_dim": 256,
"device": "cpu",
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/models/",
"n_states": 4,
"n_actions": 2
}

[binary image file added, 44 KiB]

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,93.0,93
3,155.0,155
4,116.0,116
5,200.0,200
6,190.0,190
7,176.0,176
8,200.0,200
9,200.0,200
10,200.0,200
11,179.0,179
12,200.0,200
13,185.0,185
14,191.0,191
15,200.0,200
16,200.0,200
17,124.0,124
18,200.0,200
19,172.0,172

[binary image file added, 63 KiB]

@@ -0,0 +1 @@
{"algo_name": "A2C", "env_name": "CartPole-v0", "train_eps": 1600, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "actor_lr": 0.0003, "critic_lr": 0.001, "actor_hidden_dim": 256, "critic_hidden_dim": 256, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/models/", "n_states": 4, "n_actions": 2}

[binary image file added, 41 KiB]

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,177.0,177
1,180.0,180
2,200.0,200
3,200.0,200
4,167.0,167
5,124.0,124
6,128.0,128
7,200.0,200
8,200.0,200
9,200.0,200
10,186.0,186
11,187.0,187
12,200.0,200
13,176.0,176
14,200.0,200
15,200.0,200
16,200.0,200
17,200.0,200
18,185.0,185
19,180.0,180

[binary image file added, 66 KiB]

projects/codes/A3C/a3c.py (new file, 56 lines)

@@ -0,0 +1,56 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-05-03 22:16:08
LastEditor: JiangJi
LastEditTime: 2022-07-20 23:54:40
Discription:
Environment:
'''
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
class ActorCritic(nn.Module):
''' A2C network model, containing an Actor and a Critic
'''
def __init__(self, input_dim, output_dim, hidden_dim):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)
def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value
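# forward returns a Categorical distribution over the actions together with the state value;
# callers can draw an action with dist.sample() and recover its log-probability with
# dist.log_prob(action).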
class A2C:
''' A2C algorithm
'''
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def compute_returns(self,next_value, rewards, masks):
R = next_value
returns = []
for step in reversed(range(len(rewards))):
R = rewards[step] + self.gamma * R * masks[step]
returns.insert(0, R)
return returns
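# Worked example of the recursion above (gamma = 0.99):
#   rewards = [1, 1, 1], masks = [1, 1, 0], next_value = 0
#   R2 = 1 + 0.99 * 0 * 0    = 1.00
#   R1 = 1 + 0.99 * 1.00 * 1 = 1.99
#   R0 = 1 + 0.99 * 1.99 * 1 ≈ 2.97
#   returns = [2.97, 1.99, 1.00]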


@@ -10,7 +10,7 @@ import torch.optim as optim
import datetime
import argparse
from common.multiprocessing_env import SubprocVecEnv
from a2c import ActorCritic
from a3c import ActorCritic
from common.utils import save_results, make_dir
from common.utils import plot_rewards, save_args


@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-08-23 23:59:54
LastEditTime: 2022-08-29 23:30:08
@Discription:
@Environment: python 3.7.7
'''
@@ -78,7 +78,7 @@ class DQN:
self.batch_size)
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
# print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
@@ -91,7 +91,7 @@ class DQN:
# compute expected q value, for terminal state, done_batch[0]=1, and expected_q_value=reward correspondingly
expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch)
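# i.e. the Nature DQN target y = r + gamma * max_a Q_target(s', a) * (1 - done), assuming
# next_max_q_value_batch is computed from the target network as in the original DQN paper;
# for terminal transitions (done = 1) the target reduces to the reward alone.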
# print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # both have shape (batchsize,1)
# backpropagation
self.optimizer.zero_grad()
loss.backward()


@@ -9,129 +9,122 @@ import torch
import datetime
import numpy as np
import argparse
from common.utils import save_results,all_seed
from common.utils import plot_rewards,save_args
from common.utils import all_seed
from common.models import MLP
from common.memories import ReplayBuffer
from common.launcher import Launcher
from envs.register import register_env
from dqn import DQN
class Main(Launcher):
def get_args(self):
""" hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
# please manually change the following args in this script if you want
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models' )
args = parser.parse_args()
args = {**vars(args)} # type(dict)
return args
def get_args():
""" hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
# please manually change the following args in this script if you want
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models' )
args = parser.parse_args()
args = {**vars(args)} # type(dict)
return args
def env_agent_config(cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg parameters
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
agent = DQN(model,memory,cfg) # create agent
return env, agent
def env_agent_config(cfg):
''' create env and agent
'''
env = gym.make(cfg['env_name']) # create env
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg parameters
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
agent = DQN(model,memory,cfg) # create agent
return env, agent
def train(cfg, env, agent):
''' train
'''
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step += 1
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.memory.push(state, action, reward,
next_state, done) # save transitions
state = next_state # update next state for env
agent.update() # update agent
ep_reward += reward #
if done:
break
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in the pseudocode
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
return res_dic
def train(cfg, env, agent):
''' train
'''
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
ep_step += 1
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.memory.push(state, action, reward,
next_state, done) # save transitions
state = next_state # update next state for env
agent.update() # update agent
ep_reward += reward #
if done:
break
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in the pseudocode
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep + 1) % 10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(cfg, env, agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
ep_step+=1
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards}
def test(cfg, env, agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
ep_step+=1
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg['result_path']) # save parameters
agent.save_model(path = cfg['model_path']) # save models
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
main = Main()
main.run()


@@ -1 +1,21 @@
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true}
{
"algo_name": "DQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.95,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"device": "cpu",
"seed": 10,
"result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results",
"model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models",
"show_fig": false,
"save_fig": true
}


@@ -0,0 +1,24 @@
{
"algo_name": "DQN",
"env_name": "CartPole-v1",
"train_eps": 2000,
"test_eps": 20,
"ep_max_steps": 100000,
"gamma": 0.99,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 6000,
"lr": 1e-05,
"memory_capacity": 200000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"device": "cuda",
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models",
"n_states": 4,
"n_actions": 2
}

[binary image file added, 50 KiB]

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,371.0,371
1,446.0,446
2,300.0,300
3,500.0,500
4,313.0,313
5,500.0,500
6,341.0,341
7,489.0,489
8,304.0,304
9,358.0,358
10,278.0,278
11,500.0,500
12,500.0,500
13,500.0,500
14,500.0,500
15,476.0,476
16,308.0,308
17,394.0,394
18,500.0,500
19,500.0,500

[binary image file added, 50 KiB]

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-07-21 00:08:26
LastEditTime: 2022-08-29 23:34:20
@Discription:
@Environment: python 3.7.7
'''
@@ -20,148 +20,87 @@ import torch.nn.functional as F
import random
import math
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' the buffer is a queue: once capacity is exceeded, the earliest stored transition is dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' return the current number of stored transitions
'''
return len(self.buffer)
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" initialize the Q network as a fully connected network
n_states: number of input features, i.e. the state dimension of the environment
n_actions: output dimension, i.e. the number of actions
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
self.fc3 = nn.Linear(hidden_dim, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
class DoubleDQN:
def __init__(self, n_states, n_actions, model, memory, cfg):
self.n_actions = n_actions # total number of actions
self.device = torch.device(cfg.device) # device, cpu or gpu
self.gamma = cfg.gamma
# parameters of the e-greedy policy
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
def __init__(self,models, memories, cfg):
self.n_actions = cfg['n_actions']
self.device = torch.device(cfg['device'])
self.gamma = cfg['gamma']
## e-greedy parameters
self.sample_count = 0 # sample count for epsilon decay
self.epsilon_start = cfg['epsilon_start']
self.epsilon_end = cfg['epsilon_end']
self.epsilon_decay = cfg['epsilon_decay']
self.batch_size = cfg['batch_size']
self.policy_net = models['Qnet'].to(self.device)
self.target_net = models['Qnet'].to(self.device)
# target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data)
# self.target_net.eval() # do not enable BatchNormalization or Dropout
# see the difference between parameters() and state_dict(): the former has requires_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = memory
# self.target_net.eval() # do not use BatchNormalization or Dropout
# the difference between parameters() and state_dict() is that parameters() has requires_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr'])
self.memory = memories['Memory']
self.update_flag = False
def sample(self, state):
''' select an action
def sample_action(self, state):
''' sample action
'''
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
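# Exponential epsilon schedule: with the defaults used in main.py (0.95 -> 0.01, decay 500),
# epsilon drops to roughly 0.36 after 500 sampled actions and roughly 0.14 after 1000.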
if random.random() > self.epsilon:
with torch.no_grad():
# first convert to a tensor before feeding it to the network; the state elements are float64 originally
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
# tensor.max(1) returns the maximum of each row and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
def predict(self, state):
''' select an action
def predict_action(self, state):
''' predict action
'''
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
return action
def update(self):
if len(self.memory) < self.batch_size: # only update once the memory holds enough transitions
if len(self.memory) < self.batch_size: # when the transitions in memory do not fill a batch, do not update
return
# randomly sample transitions from memory
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
else:
if not self.update_flag:
print("Begin to update!")
self.update_flag = True
# sample a batch of transitions from replay buffer
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
# convert to tensor
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
1) # e.g. tensor([[1],...,[0]])
reward_batch = torch.tensor(
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device) # convert bool to float, then to a tensor
# compute Q(s_t, a) for the current (s_t, a)
q_values = self.policy_net(state_batch)
next_q_values = self.policy_net(next_state_batch)
# plug in the chosen action to get Q(s_t|a=a_t)
q_value = q_values.gather(dim=1, index=action_batch)
'''the following is the Nature DQN way of computing q_target
# compute the maximum Q'(s_{t+1}) over all next states, where Q' is the target network's q function
next_q_state_value = self.target_net(
next_state_batch).max(1)[0].detach() # e.g. tensor([ 0.0060, -0.0171,...,])
# compute q_target
# for terminal states done_batch[0]=1, and the corresponding expected_q_value equals the reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''the following is the Double DQN way of computing q_target, slightly different from Nature DQN'''
next_target_values = self.target_net(
next_state_batch)
# pick the action that maximizes Q(s_t, a), plug it into next_target_values to get the target net's next_q_value, i.e. Q(s_t|a=argmax Q(s_t, a))
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # mean squared error loss
# optimize the model
self.optimizer.zero_grad() # zero_grad clears all old gradients from the last step
# loss.backward() uses backpropagation to compute the gradient of the loss w.r.t. all parameters (that require gradients)
self.loss.backward()
for param in self.policy_net.parameters(): # clip to avoid gradient explosion
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
# compute current Q(s_t|a=a_t)
q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
next_q_value_batch = self.policy_net(next_state_batch)
'''the following is the way of computing the Double DQN expected_q_value, a bit different from Nature DQN'''
next_target_value_batch = self.target_net(next_state_batch)
# choose action a from Q(s_t, a), then index next_target_value_batch with it to obtain next_q_value, which is Q(s_t|a=argmax Q(s_t, a))
next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1)
expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch)
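# Double DQN target: y = r + gamma * Q_target(s', argmax_a Q_policy(s', a)) * (1 - done).
# The policy net selects the next action and the target net evaluates it, which reduces the
# over-estimation bias of taking the max over the target net alone as in Nature DQN.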
loss = nn.MSELoss()(q_value_batch , expected_q_value_batch)
self.optimizer.zero_grad()
loss.backward()
# clip to avoid gradient explosion
for param in self.policy_net.parameters():
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # update the model
self.optimizer.step()
def save(self,path):
def save_model(self,path):
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
def load(self,path):
def load_model(self,path):
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)


@@ -0,0 +1,129 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-11-07 18:10:37
LastEditor: JiangJi
LastEditTime: 2022-08-29 23:33:31
Discription:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import datetime
import argparse
from common.utils import all_seed
from common.models import MLP
from common.memories import ReplayBufferQue
from DoubleDQN.double_dqn import DoubleDQN
from common.launcher import Launcher
from envs.register import register_env
class Main(Launcher):
def get_args(self):
''' hyperparameters
'''
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg parameters
models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])}
memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])}
agent = DoubleDQN(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg["train_eps"]):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action = agent.sample_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push((state, action, reward, next_state, done))
state = next_state
agent.update()
if done:
break
if i_ep % cfg['target_update'] == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
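# Hard target update: the line above copies the policy-net weights into the target net every
# `target_update` episodes (the "C-step" synchronisation from the DQN pseudocode).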
steps.append(ep_step)
rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
env.close()
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
return res_dic
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = []
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()


@@ -1 +0,0 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}

[binary image file removed, 34 KiB]

[binary image file removed, 43 KiB]

@@ -0,0 +1 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2}

[binary image file added, 53 KiB]

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,145.0,0
1,166.0,0
2,171.0,0
3,200.0,0
4,139.0,0
5,200.0,0
6,200.0,0
7,141.0,0
8,200.0,0
9,187.0,0
10,166.0,0
11,172.0,0
12,121.0,0
13,200.0,0
14,200.0,0
15,149.0,0
16,128.0,0
17,200.0,0
18,178.0,0
19,185.0,0

[binary image file added, 65 KiB]

@@ -0,0 +1,201 @@
episodes,rewards,steps
0,19.0,0
1,16.0,0
2,17.0,0
3,11.0,0
4,10.0,0
5,27.0,0
6,16.0,0
7,9.0,0
8,20.0,0
9,21.0,0
10,15.0,0
11,10.0,0
12,14.0,0
13,37.0,0
14,12.0,0
15,10.0,0
16,27.0,0
17,33.0,0
18,19.0,0
19,13.0,0
20,26.0,0
21,15.0,0
22,29.0,0
23,11.0,0
24,20.0,0
25,23.0,0
26,23.0,0
27,26.0,0
28,17.0,0
29,33.0,0
30,16.0,0
31,48.0,0
32,48.0,0
33,69.0,0
34,58.0,0
35,24.0,0
36,18.0,0
37,28.0,0
38,12.0,0
39,12.0,0
40,18.0,0
41,12.0,0
42,13.0,0
43,21.0,0
44,30.0,0
45,32.0,0
46,22.0,0
47,18.0,0
48,12.0,0
49,12.0,0
50,20.0,0
51,32.0,0
52,15.0,0
53,100.0,0
54,26.0,0
55,25.0,0
56,18.0,0
57,15.0,0
58,35.0,0
59,12.0,0
60,65.0,0
61,27.0,0
62,29.0,0
63,22.0,0
64,83.0,0
65,24.0,0
66,28.0,0
67,15.0,0
68,43.0,0
69,13.0,0
70,22.0,0
71,46.0,0
72,14.0,0
73,32.0,0
74,44.0,0
75,53.0,0
76,31.0,0
77,51.0,0
78,61.0,0
79,30.0,0
80,36.0,0
81,30.0,0
82,48.0,0
83,26.0,0
84,27.0,0
85,43.0,0
86,20.0,0
87,87.0,0
88,71.0,0
89,43.0,0
90,57.0,0
91,40.0,0
92,37.0,0
93,43.0,0
94,31.0,0
95,45.0,0
96,47.0,0
97,52.0,0
98,48.0,0
99,98.0,0
100,49.0,0
101,98.0,0
102,68.0,0
103,70.0,0
104,74.0,0
105,73.0,0
106,127.0,0
107,92.0,0
108,70.0,0
109,97.0,0
110,66.0,0
111,112.0,0
112,138.0,0
113,81.0,0
114,74.0,0
115,153.0,0
116,113.0,0
117,88.0,0
118,138.0,0
119,200.0,0
120,84.0,0
121,123.0,0
122,158.0,0
123,171.0,0
124,137.0,0
125,143.0,0
126,170.0,0
127,127.0,0
128,118.0,0
129,200.0,0
130,189.0,0
131,149.0,0
132,137.0,0
133,115.0,0
134,153.0,0
135,136.0,0
136,140.0,0
137,169.0,0
138,187.0,0
139,200.0,0
140,196.0,0
141,200.0,0
142,200.0,0
143,137.0,0
144,200.0,0
145,185.0,0
146,200.0,0
147,164.0,0
148,200.0,0
149,143.0,0
150,143.0,0
151,112.0,0
152,192.0,0
153,200.0,0
154,144.0,0
155,188.0,0
156,200.0,0
157,133.0,0
158,200.0,0
159,143.0,0
160,158.0,0
161,161.0,0
162,169.0,0
163,176.0,0
164,200.0,0
165,149.0,0
166,156.0,0
167,200.0,0
168,200.0,0
169,200.0,0
170,134.0,0
171,171.0,0
172,200.0,0
173,200.0,0
174,200.0,0
175,194.0,0
176,200.0,0
177,138.0,0
178,159.0,0
179,187.0,0
180,200.0,0
181,192.0,0
182,200.0,0
183,200.0,0
184,200.0,0
185,173.0,0
186,200.0,0
187,178.0,0
188,176.0,0
189,196.0,0
190,200.0,0
191,195.0,0
192,158.0,0
193,156.0,0
194,200.0,0
195,200.0,0
196,200.0,0
197,200.0,0
198,193.0,0
199,200.0,0


@@ -0,0 +1 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2}

[binary image file added, 40 KiB]

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,200.0,0
1,200.0,0
2,200.0,0
3,200.0,0
4,191.0,0
5,200.0,0
6,200.0,0
7,179.0,0
8,200.0,0
9,200.0,0
10,200.0,0
11,190.0,0
12,147.0,0
13,197.0,0
14,200.0,0
15,200.0,0
16,167.0,0
17,200.0,0
18,200.0,0
19,200.0,0

[binary image file added, 65 KiB]

@@ -0,0 +1,201 @@
episodes,rewards,steps
0,19.0,0
1,16.0,0
2,17.0,0
3,11.0,0
4,10.0,0
5,27.0,0
6,55.0,0
7,17.0,0
8,23.0,0
9,9.0,0
10,17.0,0
11,14.0,0
12,17.0,0
13,12.0,0
14,14.0,0
15,16.0,0
16,27.0,0
17,36.0,0
18,17.0,0
19,17.0,0
20,21.0,0
21,23.0,0
22,13.0,0
23,12.0,0
24,17.0,0
25,26.0,0
26,25.0,0
27,17.0,0
28,10.0,0
29,16.0,0
30,14.0,0
31,19.0,0
32,23.0,0
33,37.0,0
34,29.0,0
35,22.0,0
36,29.0,0
37,15.0,0
38,16.0,0
39,18.0,0
40,23.0,0
41,16.0,0
42,26.0,0
43,13.0,0
44,24.0,0
45,39.0,0
46,23.0,0
47,32.0,0
48,123.0,0
49,18.0,0
50,39.0,0
51,17.0,0
52,28.0,0
53,34.0,0
54,26.0,0
55,61.0,0
56,28.0,0
57,16.0,0
58,45.0,0
59,41.0,0
60,49.0,0
61,18.0,0
62,40.0,0
63,24.0,0
64,37.0,0
65,26.0,0
66,51.0,0
67,17.0,0
68,152.0,0
69,17.0,0
70,29.0,0
71,37.0,0
72,15.0,0
73,55.0,0
74,152.0,0
75,23.0,0
76,45.0,0
77,30.0,0
78,39.0,0
79,20.0,0
80,53.0,0
81,49.0,0
82,71.0,0
83,115.0,0
84,41.0,0
85,52.0,0
86,52.0,0
87,36.0,0
88,84.0,0
89,122.0,0
90,49.0,0
91,200.0,0
92,67.0,0
93,87.0,0
94,183.0,0
95,132.0,0
96,76.0,0
97,200.0,0
98,200.0,0
99,200.0,0
100,200.0,0
101,200.0,0
102,106.0,0
103,192.0,0
104,111.0,0
105,95.0,0
106,200.0,0
107,200.0,0
108,148.0,0
109,200.0,0
110,97.0,0
111,200.0,0
112,200.0,0
113,105.0,0
114,135.0,0
115,200.0,0
116,144.0,0
117,156.0,0
118,200.0,0
119,200.0,0
120,166.0,0
121,200.0,0
122,200.0,0
123,200.0,0
124,200.0,0
125,200.0,0
126,200.0,0
127,158.0,0
128,139.0,0
129,200.0,0
130,200.0,0
131,200.0,0
132,200.0,0
133,122.0,0
134,200.0,0
135,188.0,0
136,200.0,0
137,183.0,0
138,200.0,0
139,200.0,0
140,200.0,0
141,200.0,0
142,200.0,0
143,158.0,0
144,200.0,0
145,200.0,0
146,200.0,0
147,191.0,0
148,200.0,0
149,194.0,0
150,178.0,0
151,200.0,0
152,200.0,0
153,200.0,0
154,162.0,0
155,200.0,0
156,200.0,0
157,128.0,0
158,200.0,0
159,184.0,0
160,194.0,0
161,200.0,0
162,200.0,0
163,200.0,0
164,200.0,0
165,160.0,0
166,163.0,0
167,200.0,0
168,200.0,0
169,200.0,0
170,141.0,0
171,200.0,0
172,200.0,0
173,200.0,0
174,200.0,0
175,200.0,0
176,200.0,0
177,157.0,0
178,164.0,0
179,200.0,0
180,200.0,0
181,200.0,0
182,200.0,0
183,200.0,0
184,200.0,0
185,193.0,0
186,182.0,0
187,200.0,0
188,200.0,0
189,200.0,0
190,200.0,0
191,200.0,0
192,174.0,0
193,178.0,0
194,200.0,0
195,200.0,0
196,200.0,0
197,200.0,0
198,200.0,0
199,200.0,0


@@ -1,125 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-11-07 18:10:37
LastEditor: JiangJi
LastEditTime: 2022-07-21 21:52:31
Discription:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from common.utils import save_results,make_dir
from common.utils import plot_rewards,save_args
from common.models import MLP
from common.memories import ReplayBuffer
from DoubleDQN.double_dqn import DoubleDQN
def get_args():
""" hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.95,type=float,help="discount factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
memory = ReplayBuffer(cfg.memory_capacity)
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print("Start training!")
print(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
rewards = [] # record rewards for all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and return the initial state
while True:
action = agent.sample(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
state = next_state
agent.update()
if done:
break
if i_ep % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
if (i_ep+1)%10 == 0:
print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Epsilon: {agent.epsilon:.3f}')
rewards.append(ep_reward)
print("Finish training!")
return {'rewards':rewards}
def test(cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
rewards = [] # record rewards for all episodes
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.predict(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
print("Finish testing!")
return {'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg,seed=1)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save the model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# testing
env, agent = env_agent_config(cfg,seed=1)
agent.load(path=cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results
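Note: DoubleDQN/double_dqn.py itself is not part of this hunk. Below is a minimal sketch, assuming a PyTorch policy_net/target_net pair like the MLP above, of the double-DQN target such an agent typically computes inside agent.update(); it is illustrative only, not the repository's implementation.

import torch

def double_dqn_target(policy_net, target_net, reward, next_state, done, gamma=0.95):
    # select the greedy next action with the online (policy) network ...
    next_action = policy_net(next_state).argmax(dim=1, keepdim=True)
    # ... but evaluate it with the target network, which is what reduces the
    # overestimation bias of vanilla DQN's max operator
    next_q = target_net(next_state).gather(1, next_action).detach()
    return reward + gamma * (1 - done) * next_q  # reward/done shaped (batch, 1)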

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-08-27 00:04:08
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from itertools import count
import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay
from common.launcher import Launcher
from envs.register import register_env
class PGNet(MLP):
''' Instead of outputting an action directly, PGNet outputs the probability of an action, so we can reuse MLP via class inheritance here
'''
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = torch.sigmoid(self.fc3(x))
return x
class Main(Launcher):
def get_args(self):
""" Hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
parser.add_argument('--update_fre',default=8,type=int)
parser.add_argument('--hidden_dim',default=36,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg['seed'])
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n # action dimension
print(f"state dim: {n_states}, action dim: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
model = PGNet(n_states,1,hidden_dim=cfg['hidden_dim'])
memory = PGReplay()
agent = PolicyGradient(model,memory,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = []
for i_ep in range(cfg['train_eps']):
state = env.reset()
ep_reward = 0
for _ in range(cfg['ep_max_steps']):
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
agent.memory.push((state,float(action),reward))
state = next_state
if done:
break
if (i_ep+1) % 10 == 0:
print(f"Episode{i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
if (i_ep+1) % cfg['update_fre'] == 0:
agent.update()
rewards.append(ep_reward)
print('Finish training!')
env.close() # close environment
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = []
for i_ep in range(cfg['test_eps']):
state = env.reset()
ep_reward = 0
for _ in range(cfg['ep_max_steps']):
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
break
print(f"Episode: {i_ep+1}/{cfg['test_eps']}Reward: {ep_reward:.2f}")
rewards.append(ep_reward)
print("Finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards}
if __name__ == "__main__":
main = Main()
main.run()
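Note: PolicyGradient.update() is only partially visible in this commit (see the pg.py hunk below). A minimal REINFORCE-style sketch follows, assuming memory holds the (state, action, reward) tuples pushed above and that the hypothetical reinforce_update helper is called every update_fre episodes; names and details here are illustrative, not the repository's code.

import torch
from torch.distributions import Bernoulli

def reinforce_update(policy_net, optimizer, transitions, gamma=0.99):
    states, actions, rewards = zip(*transitions)
    # Monte-Carlo returns, accumulated backwards over the stored steps
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # normalize as a simple baseline
    loss = 0.0
    for s, a, G in zip(states, actions, returns):
        prob = policy_net(torch.as_tensor(s, dtype=torch.float32))
        loss = loss - Bernoulli(prob).log_prob(torch.tensor(a)) * G  # REINFORCE loss term
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()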

View File

@@ -1,16 +0,0 @@
{
"algo_name": "PolicyGradient",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.99,
"lr": 0.005,
"update_fre": 8,
"hidden_dim": 36,
"device": "cpu",
"seed": 1,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/models/",
"save_fig": true,
"show_fig": false
}

Binary file not shown.

Before: image, 35 KiB

Binary file not shown.

Before: image, 66 KiB

View File

@@ -1,201 +0,0 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,38.0
9,40.0
10,23.0
11,14.0
12,16.0
13,25.0
14,15.0
15,23.0
16,11.0
17,28.0
18,21.0
19,62.0
20,33.0
21,27.0
22,15.0
23,17.0
24,26.0
25,35.0
26,26.0
27,14.0
28,42.0
29,45.0
30,34.0
31,39.0
32,31.0
33,17.0
34,42.0
35,41.0
36,31.0
37,39.0
38,28.0
39,12.0
40,36.0
41,33.0
42,47.0
43,40.0
44,63.0
45,36.0
46,64.0
47,79.0
48,49.0
49,40.0
50,65.0
51,47.0
52,51.0
53,30.0
54,26.0
55,41.0
56,86.0
57,61.0
58,38.0
59,200.0
60,49.0
61,70.0
62,61.0
63,101.0
64,200.0
65,152.0
66,108.0
67,46.0
68,72.0
69,87.0
70,27.0
71,126.0
72,46.0
73,25.0
74,14.0
75,42.0
76,38.0
77,55.0
78,42.0
79,51.0
80,67.0
81,83.0
82,178.0
83,115.0
84,140.0
85,97.0
86,85.0
87,61.0
88,153.0
89,200.0
90,200.0
91,200.0
92,200.0
93,64.0
94,200.0
95,200.0
96,157.0
97,128.0
98,160.0
99,35.0
100,140.0
101,113.0
102,200.0
103,154.0
104,200.0
105,200.0
106,200.0
107,198.0
108,137.0
109,200.0
110,200.0
111,102.0
112,200.0
113,200.0
114,200.0
115,200.0
116,148.0
117,200.0
118,200.0
119,200.0
120,200.0
121,200.0
122,194.0
123,200.0
124,200.0
125,200.0
126,183.0
127,200.0
128,200.0
129,200.0
130,200.0
131,200.0
132,200.0
133,200.0
134,200.0
135,200.0
136,93.0
137,96.0
138,84.0
139,103.0
140,79.0
141,104.0
142,82.0
143,105.0
144,200.0
145,200.0
146,171.0
147,200.0
148,200.0
149,200.0
150,200.0
151,197.0
152,133.0
153,142.0
154,147.0
155,156.0
156,131.0
157,181.0
158,163.0
159,146.0
160,200.0
161,176.0
162,200.0
163,173.0
164,177.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,190.0
197,200.0
198,189.0
199,200.0

View File

@@ -0,0 +1 @@
{"algo_name": "PolicyGradient", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "lr": 0.01, "update_fre": 8, "hidden_dim": 36, "device": "cpu", "seed": 1, "save_fig": true, "show_fig": false, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/models/", "n_states": 4, "n_actions": 2}

Binary file not shown.

After: image, 28 KiB

View File

@@ -1,7 +1,7 @@
episodes,rewards
0,200.0
1,200.0
2,165.0
2,200.0
3,200.0
4,200.0
5,200.0
@@ -10,12 +10,12 @@ episodes,rewards
8,200.0
9,200.0
10,200.0
11,168.0
11,200.0
12,200.0
13,200.0
14,200.0
15,115.0
16,198.0
15,200.0
16,200.0
17,200.0
18,200.0
19,200.0

Binary file not shown.

After: image, 60 KiB

View File

@@ -0,0 +1,201 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,93.0
9,36.0
10,29.0
11,18.0
12,14.0
13,62.0
14,20.0
15,40.0
16,10.0
17,10.0
18,10.0
19,11.0
20,10.0
21,14.0
22,12.0
23,8.0
24,19.0
25,33.0
26,22.0
27,32.0
28,16.0
29,24.0
30,24.0
31,24.0
32,75.0
33,33.0
34,33.0
35,72.0
36,110.0
37,48.0
38,60.0
39,43.0
40,61.0
41,34.0
42,50.0
43,61.0
44,53.0
45,58.0
46,36.0
47,44.0
48,42.0
49,64.0
50,67.0
51,52.0
52,39.0
53,42.0
54,40.0
55,33.0
56,200.0
57,199.0
58,149.0
59,185.0
60,134.0
61,174.0
62,162.0
63,200.0
64,93.0
65,72.0
66,69.0
67,51.0
68,62.0
69,98.0
70,73.0
71,73.0
72,200.0
73,200.0
74,200.0
75,200.0
76,200.0
77,200.0
78,200.0
79,133.0
80,200.0
81,200.0
82,200.0
83,200.0
84,200.0
85,200.0
86,200.0
87,200.0
88,114.0
89,151.0
90,129.0
91,156.0
92,112.0
93,172.0
94,171.0
95,141.0
96,200.0
97,200.0
98,200.0
99,200.0
100,200.0
101,200.0
102,200.0
103,200.0
104,188.0
105,199.0
106,138.0
107,200.0
108,200.0
109,181.0
110,145.0
111,200.0
112,135.0
113,119.0
114,112.0
115,122.0
116,118.0
117,119.0
118,131.0
119,119.0
120,109.0
121,96.0
122,105.0
123,29.0
124,110.0
125,113.0
126,18.0
127,90.0
128,145.0
129,152.0
130,151.0
131,109.0
132,141.0
133,109.0
134,136.0
135,143.0
136,200.0
137,200.0
138,200.0
139,200.0
140,200.0
141,200.0
142,200.0
143,200.0
144,192.0
145,173.0
146,180.0
147,182.0
148,186.0
149,175.0
150,176.0
151,191.0
152,200.0
153,200.0
154,200.0
155,200.0
156,200.0
157,200.0
158,200.0
159,200.0
160,200.0
161,200.0
162,200.0
163,200.0
164,200.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,200.0
197,200.0
198,200.0
199,200.0

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2022-08-22 17:35:34
LastEditTime: 2022-08-27 13:45:26
Description:
Environment:
'''
@@ -19,20 +19,23 @@ import numpy as np
class PolicyGradient:
def __init__(self, n_states,model,memory,cfg):
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
def __init__(self, model,memory,cfg):
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.memory = memory
self.policy_net = model.to(self.device)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg['lr'])
def sample_action(self,state):
state = torch.from_numpy(state).float()
state = Variable(state)
probs = self.policy_net(state)
print("probs")
print(probs)
m = Bernoulli(probs) # 伯努利分布
action = m.sample()
action = action.data.numpy().astype(int)[0] # 转为标量
return action
def predict_action(self,state):
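For reference, a small usage sketch of the Bernoulli head used by sample_action above: the single sigmoid output of PGNet is treated as the probability of CartPole's action 1, and the sampled value maps directly to the discrete action. The numbers below are illustrative only.

import torch
from torch.distributions import Bernoulli

probs = torch.tensor([0.7])      # example output of PGNet for one state
m = Bernoulli(probs)
action = m.sample()              # tensor([0.]) or tensor([1.])
log_prob = m.log_prob(action)    # reusable term for the policy-gradient loss
print(int(action.item()), log_prob.item())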

View File

@@ -1,139 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-08-22 17:40:07
Description:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
import datetime
import argparse
from itertools import count
import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay
def get_args():
""" Hyperparameters
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
parser.add_argument('--update_fre',default=8,type=int)
parser.add_argument('--hidden_dim',default=36,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
args = parser.parse_args([])
return args
class PGNet(MLP):
''' Instead of outputting an action directly, PGNet outputs the probability of an action, so we can reuse MLP via class inheritance here
'''
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
def env_agent_config(cfg):
env = gym.make(cfg.env_name)
if cfg.seed !=0: # set random seed
all_seed(env,seed=cfg.seed)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n # action dimension
print(f"state dim: {n_states}, action dim: {n_actions}")
model = PGNet(n_states,1,hidden_dim=cfg.hidden_dim)
memory = PGReplay()
agent = PolicyGradient(n_states,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print('Start training!')
print(f'Env:{cfg.env_name}, Algo:{cfg.algo_name}, Device:{cfg.device}')
rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
agent.memory.push((state,float(action),reward))
state = next_state
if done:
print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}')
break
if (i_ep+1) % cfg.update_fre == 0:
agent.update()
rewards.append(ep_reward)
print('Finish training!')
env.close() # close environment
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(cfg,env,agent):
print("start testing!")
print(f"Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}")
rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
break
rewards.append(ep_reward)
print("finish testing!")
env.close()
return {'episodes':range(len(rewards)),'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg.result_path) # save parameters
agent.save_model(path = cfg.model_path) # save models
save_results(res_dic, tag = 'train', path = cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test")

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-08-24 11:27:01
LastEditTime: 2022-08-26 22:46:21
Description:
Environment:
'''
@@ -18,136 +18,105 @@ sys.path.append(parent_path) # add path to system path
import gym
import datetime
import argparse
from envs.gridworld_env import CliffWalkingWapper,FrozenLakeWapper
from envs.gridworld_env import FrozenLakeWapper
from envs.wrappers import CliffWalkingWapper
from envs.register import register_env
from qlearning import QLearning
from common.utils import plot_rewards,save_args,all_seed
from common.utils import save_results,make_dir
def get_args():
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(cfg):
''' create env and agent
'''
if cfg['env_name'] == 'CliffWalking-v0':
env = gym.make(cfg['env_name'])
env = CliffWalkingWapper(env)
if cfg['env_name'] == 'FrozenLake-v1':
env = gym.make(cfg['env_name'],is_slippery=False)
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
n_states = env.observation_space.n # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
agent = QLearning(cfg)
return env,agent
def main(cfg,env,agent,tag = 'train'):
print(f"Start {tag}ing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward per episode
state = env.reset() # reset environment, i.e. start a new episode
while True:
if tag == 'train': action = agent.sample_action(state) # sample an action with the algorithm
else: action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action) # interact with the environment for one step
if tag == 'train': agent.update(state, action, reward, next_state, done) # Q-learning update
state = next_state # update state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.1f}, Epsilon: {agent.epsilon}")
print(f"Finish {tag}ing!")
return {"rewards":rewards}
def train(cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
while True:
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.update(state, action, reward, next_state, done) # update agent
state = next_state # update state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps: {ep_step}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
from common.utils import all_seed
from common.launcher import Launcher
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['env_name'] == 'CliffWalking-v0':
env = CliffWalkingWapper(env)
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # discrete observation space (e.g. CliffWalking)
except AttributeError:
n_states = env.observation_space.shape[0] # box observation space
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
agent = QLearning(cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
while True:
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
agent.update(state, action, reward, next_state, done) # update agent
state = next_state # update state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps: {ep_step}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
while True:
action = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg['result_path']) # save parameters
agent.save_model(path = cfg['model_path']) # save models
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg['result_path'])
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
main = Main()
main.run()
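Note: qlearning.py is not shown in this commit. A minimal sketch follows, assuming a defaultdict Q-table and the epsilon_start/epsilon_end/epsilon_decay arguments above, of the epsilon-greedy tabular update such a QLearning agent typically implements; it is illustrative only, not the repository's class.

import math
import numpy as np
from collections import defaultdict

class QLearningSketch:
    def __init__(self, cfg):
        self.Q = defaultdict(lambda: np.zeros(cfg['n_actions']))  # Q-table keyed by state
        self.lr, self.gamma = cfg['lr'], cfg['gamma']
        self.sample_count = 0
        self.eps_start, self.eps_end, self.eps_decay = (
            cfg['epsilon_start'], cfg['epsilon_end'], cfg['epsilon_decay'])

    def sample_action(self, state):
        self.sample_count += 1
        # exponential epsilon decay driven by the epsilon_* hyperparameters above
        self.epsilon = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-self.sample_count / self.eps_decay)
        if np.random.uniform() > self.epsilon:
            return int(np.argmax(self.Q[state]))  # exploit
        return np.random.randint(len(self.Q[state]))  # explore

    def update(self, s, a, r, s_next, done):
        # one-step TD target; no bootstrap on terminal transitions
        target = r if done else r + self.gamma * np.max(self.Q[s_next])
        self.Q[s][a] += self.lr * (target - self.Q[s][a])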

View File

@@ -1,21 +0,0 @@
episodes,rewards
0,-13
1,-13
2,-13
3,-13
4,-13
5,-13
6,-13
7,-13
8,-13
9,-13
10,-13
11,-13
12,-13
13,-13
14,-13
15,-13
16,-13
17,-13
18,-13
19,-13

View File

@@ -1,401 +0,0 @@
episodes,rewards
0,-2131
1,-1086
2,-586
3,-220
4,-154
5,-122
6,-150
7,-159
8,-164
9,-88
10,-195
11,-114
12,-60
13,-179
14,-101
15,-304
16,-96
17,-119
18,-113
19,-98
20,-106
21,-105
22,-77
23,-51
24,-105
25,-136
26,-100
27,-29
28,-79
29,-114
30,-82
31,-70
32,-75
33,-51
34,-94
35,-52
36,-93
37,-71
38,-73
39,-48
40,-52
41,-96
42,-46
43,-65
44,-57
45,-41
46,-104
47,-51
48,-181
49,-229
50,-39
51,-69
52,-53
53,-59
54,-26
55,-75
56,-31
57,-60
58,-63
59,-40
60,-35
61,-79
62,-42
63,-22
64,-73
65,-71
66,-18
67,-55
68,-29
69,-43
70,-70
71,-49
72,-42
73,-29
74,-81
75,-36
76,-38
77,-36
78,-52
79,-28
80,-42
81,-52
82,-66
83,-31
84,-27
85,-49
86,-28
87,-54
88,-34
89,-35
90,-50
91,-36
92,-36
93,-46
94,-34
95,-135
96,-39
97,-36
98,-26
99,-56
100,-40
101,-40
102,-26
103,-28
104,-31
105,-35
106,-26
107,-57
108,-44
109,-41
110,-31
111,-26
112,-25
113,-41
114,-32
115,-44
116,-30
117,-32
118,-30
119,-25
120,-23
121,-47
122,-24
123,-45
124,-39
125,-21
126,-43
127,-143
128,-26
129,-20
130,-32
131,-16
132,-24
133,-42
134,-25
135,-36
136,-19
137,-29
138,-43
139,-17
140,-150
141,-32
142,-34
143,-19
144,-26
145,-30
146,-31
147,-49
148,-33
149,-21
150,-17
151,-48
152,-34
153,-20
154,-20
155,-26
156,-21
157,-13
158,-40
159,-22
160,-26
161,-30
162,-29
163,-25
164,-26
165,-27
166,-21
167,-29
168,-24
169,-17
170,-22
171,-35
172,-35
173,-18
174,-135
175,-15
176,-23
177,-28
178,-25
179,-24
180,-29
181,-31
182,-24
183,-129
184,-45
185,-24
186,-17
187,-20
188,-21
189,-23
190,-15
191,-32
192,-22
193,-19
194,-17
195,-45
196,-15
197,-14
198,-14
199,-37
200,-23
201,-17
202,-19
203,-21
204,-23
205,-27
206,-14
207,-18
208,-23
209,-34
210,-23
211,-13
212,-25
213,-17
214,-13
215,-21
216,-29
217,-18
218,-24
219,-15
220,-27
221,-25
222,-21
223,-19
224,-17
225,-18
226,-13
227,-22
228,-14
229,-13
230,-29
231,-23
232,-15
233,-15
234,-14
235,-28
236,-25
237,-17
238,-23
239,-29
240,-15
241,-14
242,-15
243,-23
244,-15
245,-16
246,-19
247,-13
248,-16
249,-17
250,-25
251,-30
252,-13
253,-14
254,-15
255,-22
256,-14
257,-17
258,-126
259,-15
260,-21
261,-16
262,-23
263,-14
264,-13
265,-13
266,-19
267,-13
268,-19
269,-17
270,-17
271,-13
272,-19
273,-13
274,-13
275,-16
276,-22
277,-14
278,-15
279,-19
280,-34
281,-13
282,-15
283,-32
284,-13
285,-13
286,-13
287,-14
288,-16
289,-13
290,-13
291,-17
292,-13
293,-13
294,-22
295,-14
296,-15
297,-13
298,-13
299,-13
300,-16
301,-13
302,-14
303,-13
304,-13
305,-13
306,-24
307,-13
308,-13
309,-15
310,-13
311,-13
312,-13
313,-15
314,-13
315,-19
316,-15
317,-17
318,-13
319,-13
320,-13
321,-13
322,-13
323,-15
324,-13
325,-13
326,-13
327,-123
328,-13
329,-13
330,-13
331,-13
332,-13
333,-13
334,-13
335,-13
336,-16
337,-13
338,-23
339,-13
340,-13
341,-13
342,-13
343,-13
344,-13
345,-13
346,-13
347,-13
348,-13
349,-13
350,-134
351,-13
352,-13
353,-13
354,-13
355,-13
356,-13
357,-13
358,-13
359,-13
360,-15
361,-13
362,-13
363,-13
364,-13
365,-13
366,-13
367,-13
368,-13
369,-14
370,-13
371,-13
372,-13
373,-13
374,-13
375,-13
376,-13
377,-124
378,-13
379,-13
380,-13
381,-13
382,-13
383,-13
384,-13
385,-13
386,-13
387,-13
388,-13
389,-121
390,-13
391,-13
392,-13
393,-13
394,-13
395,-13
396,-13
397,-13
398,-17
399,-13

View File

@@ -0,0 +1 @@
{"algo_name": "Q-learning", "env_name": "CliffWalking-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/models/", "n_states": 48, "n_actions": 4}

View File

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,-13,13
1,-13,13
2,-13,13
3,-13,13
4,-13,13
5,-13,13
6,-13,13
7,-13,13
8,-13,13
9,-13,13
10,-13,13
11,-13,13
12,-13,13
13,-13,13
14,-13,13
15,-13,13
16,-13,13
17,-13,13
18,-13,13
19,-13,13

View File

@@ -0,0 +1,401 @@
episodes,rewards,steps
0,-2131,448
1,-1086,492
2,-586,388
3,-220,220
4,-154,154
5,-122,122
6,-150,150
7,-159,159
8,-164,164
9,-88,88
10,-195,195
11,-114,114
12,-60,60
13,-179,179
14,-101,101
15,-304,205
16,-96,96
17,-119,119
18,-113,113
19,-98,98
20,-106,106
21,-105,105
22,-77,77
23,-51,51
24,-105,105
25,-136,136
26,-100,100
27,-29,29
28,-79,79
29,-114,114
30,-82,82
31,-70,70
32,-75,75
33,-51,51
34,-94,94
35,-52,52
36,-93,93
37,-71,71
38,-73,73
39,-48,48
40,-52,52
41,-96,96
42,-46,46
43,-65,65
44,-57,57
45,-41,41
46,-104,104
47,-51,51
48,-181,82
49,-229,130
50,-39,39
51,-69,69
52,-53,53
53,-59,59
54,-26,26
55,-75,75
56,-31,31
57,-60,60
58,-63,63
59,-40,40
60,-35,35
61,-79,79
62,-42,42
63,-22,22
64,-73,73
65,-71,71
66,-18,18
67,-55,55
68,-29,29
69,-43,43
70,-70,70
71,-49,49
72,-42,42
73,-29,29
74,-81,81
75,-36,36
76,-38,38
77,-36,36
78,-52,52
79,-28,28
80,-42,42
81,-52,52
82,-66,66
83,-31,31
84,-27,27
85,-49,49
86,-28,28
87,-54,54
88,-34,34
89,-35,35
90,-50,50
91,-36,36
92,-36,36
93,-46,46
94,-34,34
95,-135,36
96,-39,39
97,-36,36
98,-26,26
99,-56,56
100,-40,40
101,-40,40
102,-26,26
103,-28,28
104,-31,31
105,-35,35
106,-26,26
107,-57,57
108,-44,44
109,-41,41
110,-31,31
111,-26,26
112,-25,25
113,-41,41
114,-32,32
115,-44,44
116,-30,30
117,-32,32
118,-30,30
119,-25,25
120,-23,23
121,-47,47
122,-24,24
123,-45,45
124,-39,39
125,-21,21
126,-43,43
127,-143,44
128,-26,26
129,-20,20
130,-32,32
131,-16,16
132,-24,24
133,-42,42
134,-25,25
135,-36,36
136,-19,19
137,-29,29
138,-43,43
139,-17,17
140,-150,51
141,-32,32
142,-34,34
143,-19,19
144,-26,26
145,-30,30
146,-31,31
147,-49,49
148,-33,33
149,-21,21
150,-17,17
151,-48,48
152,-34,34
153,-20,20
154,-20,20
155,-26,26
156,-21,21
157,-13,13
158,-40,40
159,-22,22
160,-26,26
161,-30,30
162,-29,29
163,-25,25
164,-26,26
165,-27,27
166,-21,21
167,-29,29
168,-24,24
169,-17,17
170,-22,22
171,-35,35
172,-35,35
173,-18,18
174,-135,36
175,-15,15
176,-23,23
177,-28,28
178,-25,25
179,-24,24
180,-29,29
181,-31,31
182,-24,24
183,-129,30
184,-45,45
185,-24,24
186,-17,17
187,-20,20
188,-21,21
189,-23,23
190,-15,15
191,-32,32
192,-22,22
193,-19,19
194,-17,17
195,-45,45
196,-15,15
197,-14,14
198,-14,14
199,-37,37
200,-23,23
201,-17,17
202,-19,19
203,-21,21
204,-23,23
205,-27,27
206,-14,14
207,-18,18
208,-23,23
209,-34,34
210,-23,23
211,-13,13
212,-25,25
213,-17,17
214,-13,13
215,-21,21
216,-29,29
217,-18,18
218,-24,24
219,-15,15
220,-27,27
221,-25,25
222,-21,21
223,-19,19
224,-17,17
225,-18,18
226,-13,13
227,-22,22
228,-14,14
229,-13,13
230,-29,29
231,-23,23
232,-15,15
233,-15,15
234,-14,14
235,-28,28
236,-25,25
237,-17,17
238,-23,23
239,-29,29
240,-15,15
241,-14,14
242,-15,15
243,-23,23
244,-15,15
245,-16,16
246,-19,19
247,-13,13
248,-16,16
249,-17,17
250,-25,25
251,-30,30
252,-13,13
253,-14,14
254,-15,15
255,-22,22
256,-14,14
257,-17,17
258,-126,27
259,-15,15
260,-21,21
261,-16,16
262,-23,23
263,-14,14
264,-13,13
265,-13,13
266,-19,19
267,-13,13
268,-19,19
269,-17,17
270,-17,17
271,-13,13
272,-19,19
273,-13,13
274,-13,13
275,-16,16
276,-22,22
277,-14,14
278,-15,15
279,-19,19
280,-34,34
281,-13,13
282,-15,15
283,-32,32
284,-13,13
285,-13,13
286,-13,13
287,-14,14
288,-16,16
289,-13,13
290,-13,13
291,-17,17
292,-13,13
293,-13,13
294,-22,22
295,-14,14
296,-15,15
297,-13,13
298,-13,13
299,-13,13
300,-16,16
301,-13,13
302,-14,14
303,-13,13
304,-13,13
305,-13,13
306,-24,24
307,-13,13
308,-13,13
309,-15,15
310,-13,13
311,-13,13
312,-13,13
313,-15,15
314,-13,13
315,-19,19
316,-15,15
317,-17,17
318,-13,13
319,-13,13
320,-13,13
321,-13,13
322,-13,13
323,-15,15
324,-13,13
325,-13,13
326,-13,13
327,-123,24
328,-13,13
329,-13,13
330,-13,13
331,-13,13
332,-13,13
333,-13,13
334,-13,13
335,-13,13
336,-16,16
337,-13,13
338,-23,23
339,-13,13
340,-13,13
341,-13,13
342,-13,13
343,-13,13
344,-13,13
345,-13,13
346,-13,13
347,-13,13
348,-13,13
349,-13,13
350,-134,35
351,-13,13
352,-13,13
353,-13,13
354,-13,13
355,-13,13
356,-13,13
357,-13,13
358,-13,13
359,-13,13
360,-15,15
361,-13,13
362,-13,13
363,-13,13
364,-13,13
365,-13,13
366,-13,13
367,-13,13
368,-13,13
369,-14,14
370,-13,13
371,-13,13
372,-13,13
373,-13,13
374,-13,13
375,-13,13
376,-13,13
377,-124,25
378,-13,13
379,-13,13
380,-13,13
381,-13,13
382,-13,13
383,-13,13
384,-13,13
385,-13,13
386,-13,13
387,-13,13
388,-13,13
389,-121,22
390,-13,13
391,-13,13
392,-13,13
393,-13,13
394,-13,13
395,-13,13
396,-13,13
397,-13,13
398,-17,17
399,-13,13

Binary file not shown.

Before: image, 22 KiB

Binary file not shown.

Before: image, 53 KiB

View File

@@ -1,801 +0,0 @@
episodes,rewards,steps
0,0.0,20
1,0.0,14
2,0.0,13
3,0.0,9
4,0.0,10
5,0.0,6
6,0.0,11
7,0.0,6
8,0.0,3
9,0.0,9
10,0.0,11
11,0.0,22
12,0.0,5
13,0.0,16
14,0.0,4
15,0.0,9
16,0.0,18
17,0.0,2
18,0.0,4
19,0.0,8
20,0.0,7
21,0.0,4
22,0.0,22
23,0.0,15
24,0.0,5
25,0.0,16
26,0.0,7
27,0.0,19
28,0.0,22
29,0.0,16
30,0.0,11
31,0.0,22
32,0.0,28
33,0.0,23
34,0.0,4
35,0.0,11
36,0.0,8
37,0.0,15
38,0.0,5
39,0.0,7
40,0.0,9
41,0.0,4
42,0.0,3
43,0.0,6
44,0.0,41
45,0.0,9
46,0.0,23
47,0.0,3
48,1.0,38
49,0.0,29
50,0.0,17
51,0.0,4
52,0.0,2
53,0.0,25
54,0.0,6
55,0.0,2
56,0.0,30
57,0.0,6
58,0.0,7
59,0.0,11
60,0.0,9
61,0.0,8
62,0.0,23
63,0.0,10
64,0.0,3
65,0.0,5
66,0.0,7
67,0.0,18
68,0.0,8
69,0.0,26
70,0.0,6
71,0.0,14
72,0.0,4
73,0.0,25
74,0.0,21
75,0.0,13
76,0.0,4
77,0.0,29
78,0.0,21
79,0.0,6
80,0.0,6
81,0.0,11
82,0.0,21
83,0.0,9
84,0.0,9
85,0.0,7
86,0.0,48
87,0.0,23
88,0.0,100
89,0.0,60
90,0.0,7
91,0.0,10
92,0.0,24
93,0.0,4
94,0.0,7
95,0.0,17
96,0.0,87
97,0.0,28
98,0.0,7
99,0.0,5
100,0.0,12
101,0.0,14
102,0.0,6
103,0.0,13
104,0.0,93
105,0.0,4
106,0.0,50
107,0.0,8
108,0.0,12
109,0.0,43
110,0.0,30
111,0.0,15
112,0.0,19
113,0.0,100
114,0.0,82
115,0.0,40
116,0.0,88
117,0.0,19
118,0.0,30
119,0.0,27
120,0.0,5
121,0.0,87
122,0.0,9
123,0.0,64
124,0.0,27
125,0.0,68
126,0.0,81
127,0.0,86
128,0.0,100
129,0.0,100
130,0.0,27
131,0.0,41
132,0.0,70
133,0.0,27
134,0.0,6
135,0.0,18
136,0.0,38
137,0.0,26
138,0.0,36
139,0.0,3
140,0.0,61
141,0.0,100
142,0.0,4
143,0.0,39
144,0.0,18
145,0.0,33
146,0.0,29
147,0.0,49
148,0.0,88
149,0.0,22
150,0.0,65
151,0.0,36
152,0.0,30
153,0.0,58
154,0.0,43
155,0.0,53
156,0.0,43
157,0.0,13
158,0.0,8
159,0.0,39
160,0.0,29
161,0.0,26
162,0.0,60
163,0.0,100
164,0.0,31
165,0.0,22
166,0.0,100
167,0.0,46
168,0.0,23
169,0.0,54
170,0.0,8
171,0.0,58
172,0.0,3
173,0.0,47
174,0.0,16
175,0.0,21
176,0.0,44
177,0.0,29
178,0.0,100
179,0.0,100
180,0.0,62
181,0.0,83
182,0.0,26
183,0.0,24
184,0.0,10
185,0.0,12
186,0.0,40
187,0.0,25
188,0.0,18
189,0.0,60
190,0.0,100
191,0.0,100
192,0.0,24
193,0.0,56
194,0.0,71
195,0.0,19
196,0.0,100
197,0.0,44
198,0.0,41
199,0.0,41
200,0.0,60
201,0.0,31
202,0.0,34
203,0.0,35
204,0.0,59
205,0.0,51
206,0.0,100
207,0.0,100
208,0.0,100
209,0.0,100
210,0.0,37
211,0.0,68
212,0.0,40
213,0.0,17
214,0.0,79
215,0.0,100
216,0.0,26
217,0.0,61
218,0.0,25
219,0.0,18
220,0.0,27
221,0.0,13
222,0.0,100
223,0.0,87
224,0.0,100
225,0.0,92
226,0.0,100
227,0.0,8
228,0.0,100
229,0.0,64
230,0.0,17
231,0.0,82
232,0.0,100
233,0.0,94
234,0.0,7
235,0.0,36
236,0.0,100
237,0.0,56
238,0.0,17
239,0.0,100
240,0.0,83
241,0.0,100
242,0.0,100
243,0.0,43
244,0.0,87
245,0.0,42
246,0.0,80
247,0.0,54
248,0.0,82
249,0.0,97
250,0.0,65
251,0.0,83
252,0.0,100
253,0.0,59
254,0.0,100
255,0.0,78
256,0.0,100
257,0.0,100
258,0.0,43
259,0.0,80
260,0.0,100
261,0.0,70
262,0.0,94
263,0.0,100
264,0.0,100
265,0.0,37
266,0.0,11
267,0.0,31
268,0.0,100
269,0.0,34
270,0.0,32
271,0.0,58
272,0.0,38
273,0.0,28
274,0.0,100
275,0.0,59
276,0.0,100
277,0.0,82
278,0.0,51
279,0.0,25
280,0.0,73
281,0.0,56
282,0.0,55
283,0.0,38
284,0.0,100
285,0.0,100
286,0.0,92
287,0.0,100
288,0.0,100
289,0.0,100
290,0.0,37
291,0.0,100
292,0.0,66
293,0.0,24
294,0.0,17
295,0.0,100
296,0.0,59
297,0.0,25
298,0.0,73
299,0.0,100
300,0.0,29
301,0.0,100
302,0.0,72
303,0.0,6
304,1.0,57
305,0.0,47
306,0.0,48
307,0.0,13
308,0.0,100
309,0.0,38
310,0.0,100
311,0.0,20
312,0.0,100
313,0.0,100
314,0.0,5
315,0.0,39
316,0.0,11
317,0.0,83
318,0.0,42
319,0.0,100
320,0.0,99
321,0.0,83
322,0.0,28
323,0.0,46
324,0.0,100
325,0.0,100
326,0.0,62
327,0.0,100
328,0.0,23
329,0.0,91
330,0.0,53
331,0.0,19
332,0.0,26
333,0.0,93
334,0.0,38
335,0.0,22
336,0.0,43
337,0.0,100
338,0.0,90
339,0.0,18
340,0.0,45
341,0.0,65
342,1.0,22
343,0.0,100
344,1.0,15
345,1.0,72
346,0.0,5
347,1.0,6
348,1.0,6
349,1.0,9
350,1.0,8
351,1.0,9
352,1.0,8
353,1.0,6
354,1.0,6
355,1.0,10
356,1.0,6
357,0.0,5
358,0.0,3
359,1.0,6
360,1.0,6
361,1.0,6
362,1.0,6
363,1.0,8
364,1.0,6
365,1.0,8
366,1.0,6
367,1.0,6
368,1.0,8
369,1.0,6
370,1.0,6
371,0.0,5
372,1.0,6
373,0.0,6
374,1.0,6
375,1.0,12
376,1.0,6
377,1.0,6
378,1.0,9
379,1.0,6
380,1.0,6
381,0.0,2
382,0.0,3
383,0.0,2
384,0.0,4
385,0.0,3
386,1.0,7
387,1.0,6
388,1.0,6
389,1.0,8
390,1.0,9
391,1.0,8
392,1.0,8
393,1.0,6
394,1.0,6
395,1.0,7
396,1.0,6
397,0.0,5
398,0.0,5
399,1.0,10
400,1.0,6
401,0.0,3
402,1.0,6
403,1.0,7
404,1.0,6
405,1.0,6
406,1.0,6
407,1.0,6
408,1.0,6
409,1.0,6
410,1.0,6
411,0.0,5
412,1.0,6
413,1.0,6
414,0.0,2
415,1.0,6
416,1.0,6
417,1.0,6
418,1.0,6
419,1.0,6
420,1.0,8
421,1.0,6
422,1.0,6
423,1.0,6
424,1.0,6
425,1.0,7
426,0.0,5
427,1.0,6
428,1.0,6
429,1.0,6
430,1.0,8
431,1.0,6
432,1.0,6
433,1.0,6
434,1.0,6
435,0.0,2
436,1.0,8
437,1.0,7
438,1.0,6
439,1.0,7
440,1.0,6
441,1.0,6
442,0.0,3
443,0.0,4
444,1.0,6
445,1.0,6
446,1.0,7
447,1.0,6
448,1.0,6
449,1.0,6
450,1.0,6
451,1.0,6
452,1.0,6
453,1.0,8
454,1.0,6
455,1.0,6
456,1.0,6
457,1.0,6
458,1.0,6
459,1.0,7
460,1.0,8
461,1.0,6
462,1.0,7
463,1.0,6
464,1.0,6
465,1.0,6
466,1.0,6
467,1.0,8
468,1.0,6
469,1.0,6
470,1.0,8
471,1.0,6
472,1.0,11
473,1.0,6
474,1.0,6
475,1.0,6
476,1.0,8
477,0.0,2
478,1.0,7
479,1.0,6
480,1.0,6
481,1.0,7
482,1.0,6
483,1.0,6
484,1.0,6
485,1.0,6
486,0.0,3
487,1.0,7
488,1.0,6
489,1.0,6
490,1.0,6
491,0.0,3
492,1.0,6
493,1.0,7
494,1.0,12
495,1.0,6
496,0.0,9
497,1.0,6
498,1.0,6
499,0.0,8
500,1.0,6
501,0.0,3
502,0.0,5
503,0.0,3
504,1.0,6
505,1.0,6
506,1.0,6
507,1.0,6
508,1.0,6
509,1.0,6
510,1.0,6
511,1.0,6
512,1.0,6
513,1.0,6
514,0.0,2
515,1.0,7
516,1.0,6
517,1.0,6
518,1.0,6
519,1.0,6
520,1.0,6
521,1.0,7
522,0.0,4
523,1.0,6
524,0.0,5
525,1.0,6
526,1.0,6
527,1.0,6
528,1.0,6
529,0.0,3
530,1.0,6
531,1.0,6
532,1.0,6
533,1.0,7
534,1.0,8
535,1.0,6
536,1.0,6
537,1.0,6
538,1.0,6
539,1.0,7
540,1.0,7
541,1.0,7
542,1.0,8
543,1.0,6
544,1.0,10
545,1.0,6
546,1.0,6
547,1.0,6
548,1.0,8
549,1.0,6
550,1.0,6
551,1.0,8
552,1.0,6
553,1.0,7
554,1.0,6
555,1.0,7
556,1.0,6
557,1.0,6
558,1.0,7
559,1.0,7
560,1.0,7
561,1.0,6
562,1.0,6
563,1.0,6
564,1.0,6
565,1.0,6
566,1.0,6
567,1.0,6
568,1.0,7
569,0.0,4
570,1.0,8
571,1.0,8
572,1.0,7
573,1.0,6
574,1.0,8
575,1.0,6
576,1.0,6
577,1.0,7
578,1.0,6
579,1.0,6
580,1.0,8
581,1.0,7
582,1.0,6
583,1.0,6
584,0.0,3
585,1.0,11
586,1.0,6
587,1.0,8
588,0.0,2
589,1.0,6
590,1.0,6
591,1.0,6
592,1.0,6
593,1.0,8
594,1.0,6
595,1.0,7
596,1.0,6
597,1.0,7
598,1.0,6
599,1.0,8
600,0.0,2
601,1.0,6
602,1.0,7
603,1.0,6
604,1.0,6
605,1.0,10
606,1.0,7
607,1.0,6
608,1.0,6
609,1.0,6
610,1.0,6
611,1.0,6
612,1.0,7
613,0.0,4
614,1.0,7
615,1.0,6
616,1.0,8
617,0.0,3
618,1.0,6
619,1.0,6
620,1.0,6
621,1.0,6
622,0.0,2
623,1.0,6
624,1.0,6
625,1.0,6
626,1.0,6
627,1.0,6
628,1.0,7
629,1.0,6
630,1.0,6
631,1.0,7
632,1.0,6
633,1.0,6
634,1.0,6
635,1.0,6
636,1.0,6
637,1.0,6
638,1.0,6
639,1.0,8
640,1.0,6
641,1.0,8
642,1.0,7
643,1.0,6
644,0.0,3
645,1.0,6
646,1.0,7
647,1.0,6
648,1.0,6
649,1.0,6
650,1.0,10
651,1.0,6
652,1.0,6
653,1.0,6
654,1.0,6
655,1.0,10
656,1.0,6
657,1.0,8
658,1.0,8
659,1.0,7
660,1.0,6
661,0.0,5
662,0.0,2
663,1.0,8
664,1.0,6
665,1.0,10
666,1.0,6
667,1.0,8
668,1.0,10
669,1.0,6
670,1.0,6
671,1.0,6
672,1.0,10
673,1.0,6
674,0.0,4
675,1.0,6
676,1.0,6
677,1.0,6
678,1.0,15
679,1.0,6
680,1.0,6
681,1.0,6
682,1.0,6
683,1.0,6
684,1.0,6
685,1.0,8
686,1.0,6
687,1.0,7
688,1.0,6
689,1.0,6
690,1.0,8
691,1.0,6
692,1.0,6
693,1.0,8
694,1.0,8
695,1.0,6
696,1.0,6
697,1.0,6
698,1.0,10
699,1.0,6
700,1.0,6
701,1.0,6
702,1.0,6
703,1.0,6
704,1.0,6
705,1.0,6
706,1.0,8
707,1.0,8
708,1.0,6
709,1.0,6
710,0.0,2
711,1.0,6
712,1.0,6
713,1.0,6
714,1.0,8
715,1.0,6
716,1.0,6
717,1.0,6
718,1.0,6
719,1.0,6
720,1.0,6
721,1.0,6
722,1.0,6
723,1.0,6
724,1.0,7
725,0.0,3
726,1.0,7
727,1.0,6
728,1.0,6
729,1.0,6
730,0.0,2
731,1.0,6
732,1.0,8
733,1.0,6
734,1.0,6
735,1.0,6
736,1.0,6
737,1.0,9
738,1.0,6
739,1.0,6
740,1.0,6
741,1.0,6
742,1.0,6
743,1.0,6
744,1.0,9
745,1.0,7
746,0.0,4
747,1.0,6
748,1.0,8
749,1.0,11
750,1.0,6
751,1.0,6
752,1.0,6
753,1.0,6
754,1.0,6
755,1.0,8
756,1.0,6
757,1.0,6
758,1.0,8
759,1.0,7
760,1.0,6
761,1.0,8
762,1.0,6
763,0.0,5
764,1.0,9
765,1.0,8
766,1.0,8
767,1.0,6
768,1.0,8
769,1.0,8
770,1.0,6
771,0.0,5
772,0.0,3
773,0.0,2
774,1.0,8
775,1.0,6
776,1.0,6
777,1.0,6
778,1.0,6
779,1.0,6
780,1.0,6
781,1.0,6
782,1.0,6
783,1.0,6
784,1.0,6
785,1.0,6
786,1.0,6
787,1.0,6
788,1.0,6
789,0.0,2
790,1.0,6
791,0.0,4
792,1.0,6
793,1.0,6
794,1.0,6
795,1.0,6
796,1.0,6
797,1.0,8
798,0.0,5
799,1.0,6
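The per-episode rewards in these FrozenLake result files are sparse (1.0 only when the goal is reached), so the learning curve is easier to read as a moving average than as raw rewards. Below is a minimal sketch of how one of the results.csv files added in this diff could be loaded and smoothed; the file path and the 20-episode window are illustrative assumptions, not taken from the repository code.

import pandas as pd
import matplotlib.pyplot as plt

# Hypothetical path; any results.csv in this diff has the same columns: episodes,rewards,steps
df = pd.read_csv("results.csv")

# Moving average over 20 episodes (arbitrary window) to smooth the 0/1 rewards
df["smooth_rewards"] = df["rewards"].rolling(window=20, min_periods=1).mean()

plt.plot(df["episodes"], df["smooth_rewards"], label="20-episode moving average")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show()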

View File

@@ -1,6 +1,6 @@
{
"algo_name": "Q-learning",
"env_name": "FrozenLake-v1",
"env_name": "FrozenLakeNoSlippery-v1",
"train_eps": 800,
"test_eps": 20,
"gamma": 0.9,
@@ -12,8 +12,8 @@
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/models/",
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/models/",
"n_states": 16,
"n_actions": 4
}

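The hunks above switch env_name from FrozenLake-v1 to FrozenLakeNoSlippery-v1, which is not a built-in Gym id, so the training script presumably registers a deterministic FrozenLake variant before gym.make is called. Here is a minimal sketch of such a registration, assuming the standard gym.envs.registration API; the entry point, kwargs, and 100-step limit are assumptions (chosen to match n_states=16, n_actions=4 and the 100-step cap visible in the results above), not code taken from this diff.

import gym
from gym.envs.registration import register

# Hypothetical registration of a deterministic (non-slippery) 4x4 FrozenLake
register(
    id="FrozenLakeNoSlippery-v1",
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs={"map_name": "4x4", "is_slippery": False},
    max_episode_steps=100,
)

env = gym.make("FrozenLakeNoSlippery-v1")
print(env.observation_space.n, env.action_space.n)  # 16 4, matching the config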
Binary file not shown (image, 24 KiB).

Binary file not shown (image, 55 KiB).

View File

@@ -0,0 +1,801 @@
episodes,rewards,steps
0,0.0,20
1,0.0,14
2,0.0,13
3,0.0,9
4,0.0,10
5,0.0,6
6,0.0,11
7,0.0,6
8,0.0,3
9,0.0,9
10,0.0,11
11,0.0,22
12,0.0,5
13,0.0,16
14,0.0,4
15,0.0,9
16,0.0,18
17,0.0,2
18,0.0,4
19,0.0,8
20,0.0,7
21,0.0,4
22,0.0,22
23,0.0,15
24,0.0,5
25,0.0,16
26,0.0,7
27,0.0,19
28,0.0,22
29,0.0,16
30,0.0,11
31,0.0,22
32,0.0,28
33,0.0,23
34,0.0,4
35,0.0,11
36,0.0,8
37,0.0,15
38,0.0,5
39,0.0,7
40,0.0,9
41,0.0,4
42,0.0,3
43,0.0,6
44,0.0,41
45,0.0,9
46,0.0,23
47,0.0,3
48,1.0,38
49,0.0,29
50,0.0,17
51,0.0,4
52,0.0,2
53,0.0,25
54,0.0,6
55,0.0,2
56,0.0,30
57,0.0,6
58,0.0,7
59,0.0,11
60,0.0,9
61,0.0,8
62,0.0,23
63,0.0,10
64,0.0,3
65,0.0,5
66,0.0,7
67,0.0,18
68,0.0,8
69,0.0,26
70,0.0,6
71,0.0,14
72,0.0,4
73,0.0,25
74,0.0,21
75,0.0,13
76,0.0,4
77,0.0,29
78,0.0,21
79,0.0,6
80,0.0,6
81,0.0,11
82,0.0,21
83,0.0,9
84,0.0,9
85,0.0,7
86,0.0,48
87,0.0,23
88,0.0,160
89,0.0,7
90,0.0,10
91,0.0,24
92,0.0,4
93,0.0,7
94,0.0,17
95,0.0,87
96,0.0,28
97,0.0,7
98,0.0,5
99,0.0,12
100,0.0,14
101,0.0,6
102,0.0,13
103,0.0,93
104,0.0,4
105,0.0,50
106,0.0,8
107,0.0,12
108,0.0,43
109,0.0,30
110,0.0,15
111,0.0,19
112,0.0,182
113,0.0,40
114,0.0,88
115,0.0,19
116,0.0,30
117,0.0,27
118,0.0,5
119,0.0,87
120,0.0,9
121,0.0,64
122,0.0,27
123,0.0,68
124,0.0,81
125,0.0,86
126,0.0,227
127,0.0,41
128,0.0,70
129,0.0,27
130,0.0,6
131,0.0,18
132,0.0,38
133,0.0,26
134,0.0,36
135,0.0,3
136,0.0,61
137,0.0,105
138,0.0,38
139,0.0,18
140,0.0,33
141,0.0,29
142,0.0,49
143,0.0,88
144,0.0,22
145,0.0,65
146,0.0,36
147,0.0,30
148,0.0,58
149,0.0,43
150,0.0,53
151,0.0,43
152,0.0,13
153,0.0,8
154,0.0,39
155,0.0,29
156,0.0,26
157,0.0,60
158,0.0,153
159,0.0,116
160,0.0,53
161,0.0,54
162,0.0,8
163,0.0,58
164,0.0,3
165,0.0,47
166,0.0,16
167,0.0,21
168,0.0,44
169,0.0,29
170,0.0,104
171,0.0,158
172,0.0,83
173,0.0,26
174,0.0,24
175,0.0,10
176,0.0,12
177,0.0,40
178,0.0,25
179,0.0,18
180,0.0,60
181,0.0,203
182,0.0,23
183,0.0,54
184,0.0,71
185,0.0,19
186,0.0,118
187,0.0,26
188,0.0,41
189,0.0,41
190,0.0,60
191,0.0,31
192,0.0,34
193,0.0,35
194,0.0,59
195,0.0,51
196,0.0,426
197,0.0,79
198,0.0,40
199,0.0,17
200,0.0,79
201,0.0,126
202,0.0,61
203,0.0,25
204,0.0,18
205,0.0,27
206,0.0,13
207,0.0,187
208,0.0,160
209,0.0,32
210,0.0,108
211,0.0,164
212,0.0,17
213,0.0,82
214,0.0,194
215,0.0,7
216,0.0,36
217,0.0,156
218,0.0,17
219,0.0,183
220,0.0,243
221,0.0,87
222,0.0,42
223,0.0,80
224,0.0,54
225,0.0,82
226,0.0,97
227,0.0,65
228,0.0,83
229,0.0,159
230,0.0,178
231,0.0,104
232,0.0,21
233,0.0,118
234,0.0,80
235,0.0,170
236,0.0,94
237,0.0,235
238,0.0,13
239,0.0,31
240,0.0,134
241,0.0,32
242,0.0,58
243,0.0,38
244,0.0,28
245,0.0,159
246,0.0,182
247,0.0,51
248,0.0,25
249,0.0,73
250,0.0,56
251,0.0,55
252,0.0,38
253,0.0,292
254,0.0,319
255,0.0,100
256,0.0,84
257,0.0,24
258,0.0,17
259,0.0,159
260,0.0,25
261,0.0,73
262,0.0,130
263,0.0,111
264,0.0,65
265,1.0,58
266,0.0,47
267,0.0,48
268,0.0,13
269,0.0,100
270,0.0,38
271,0.0,111
272,0.0,226
273,0.0,38
274,0.0,83
275,0.0,42
276,0.0,199
277,0.0,83
278,0.0,28
279,0.0,46
280,0.0,262
281,0.0,123
282,0.0,91
283,0.0,53
284,0.0,19
285,0.0,26
286,0.0,93
287,0.0,38
288,0.0,22
289,0.0,43
290,0.0,163
291,0.0,25
292,0.0,59
293,0.0,71
294,0.0,20
295,0.0,115
296,0.0,248
297,0.0,66
298,0.0,58
299,0.0,129
300,0.0,122
301,0.0,47
302,0.0,60
303,0.0,79
304,1.0,137
305,0.0,27
306,1.0,93
307,0.0,46
308,1.0,83
309,1.0,8
310,1.0,6
311,1.0,6
312,0.0,4
313,1.0,6
314,0.0,2
315,1.0,6
316,1.0,6
317,1.0,6
318,1.0,6
319,1.0,8
320,0.0,5
321,1.0,6
322,1.0,7
323,0.0,5
324,1.0,6
325,1.0,6
326,1.0,8
327,1.0,6
328,1.0,6
329,1.0,6
330,1.0,7
331,1.0,6
332,1.0,6
333,0.0,3
334,1.0,7
335,0.0,4
336,1.0,6
337,1.0,6
338,1.0,7
339,1.0,6
340,1.0,6
341,1.0,7
342,1.0,7
343,1.0,7
344,1.0,6
345,1.0,6
346,1.0,6
347,1.0,6
348,1.0,6
349,1.0,6
350,1.0,6
351,1.0,7
352,0.0,4
353,1.0,8
354,1.0,8
355,1.0,7
356,1.0,6
357,1.0,8
358,1.0,6
359,1.0,6
360,1.0,7
361,1.0,6
362,1.0,6
363,1.0,8
364,1.0,7
365,1.0,6
366,1.0,6
367,0.0,3
368,1.0,11
369,1.0,6
370,1.0,8
371,0.0,2
372,1.0,6
373,1.0,6
374,1.0,6
375,1.0,6
376,1.0,8
377,1.0,6
378,1.0,7
379,1.0,6
380,1.0,7
381,1.0,6
382,1.0,8
383,0.0,2
384,1.0,6
385,1.0,7
386,1.0,6
387,1.0,6
388,1.0,10
389,1.0,7
390,1.0,6
391,1.0,6
392,1.0,6
393,1.0,6
394,1.0,6
395,1.0,7
396,0.0,4
397,1.0,7
398,1.0,6
399,1.0,8
400,0.0,3
401,1.0,6
402,1.0,6
403,1.0,6
404,1.0,6
405,0.0,2
406,1.0,6
407,1.0,6
408,1.0,6
409,1.0,6
410,1.0,6
411,1.0,7
412,1.0,6
413,1.0,6
414,1.0,7
415,1.0,6
416,1.0,6
417,1.0,6
418,1.0,6
419,1.0,6
420,1.0,6
421,1.0,6
422,1.0,8
423,1.0,6
424,1.0,8
425,1.0,7
426,1.0,6
427,0.0,3
428,1.0,6
429,1.0,7
430,1.0,6
431,1.0,6
432,1.0,6
433,1.0,10
434,1.0,6
435,1.0,6
436,1.0,6
437,1.0,6
438,1.0,10
439,1.0,6
440,1.0,8
441,1.0,8
442,1.0,7
443,1.0,6
444,0.0,5
445,0.0,2
446,1.0,8
447,1.0,6
448,1.0,10
449,1.0,6
450,1.0,8
451,1.0,10
452,1.0,6
453,1.0,6
454,1.0,6
455,1.0,10
456,1.0,6
457,0.0,4
458,1.0,6
459,1.0,6
460,1.0,6
461,1.0,15
462,1.0,6
463,1.0,6
464,1.0,6
465,1.0,6
466,1.0,6
467,1.0,6
468,1.0,8
469,1.0,6
470,1.0,7
471,1.0,6
472,1.0,6
473,1.0,8
474,1.0,6
475,1.0,6
476,1.0,8
477,1.0,8
478,1.0,6
479,1.0,6
480,1.0,6
481,1.0,10
482,1.0,6
483,1.0,6
484,1.0,6
485,1.0,6
486,1.0,6
487,1.0,6
488,1.0,6
489,1.0,8
490,1.0,8
491,1.0,6
492,1.0,6
493,0.0,2
494,1.0,6
495,1.0,6
496,1.0,6
497,1.0,8
498,1.0,6
499,1.0,6
500,1.0,6
501,1.0,6
502,1.0,6
503,1.0,6
504,1.0,6
505,1.0,6
506,1.0,6
507,1.0,7
508,0.0,3
509,1.0,7
510,1.0,6
511,1.0,6
512,1.0,6
513,0.0,2
514,1.0,6
515,1.0,8
516,1.0,6
517,1.0,6
518,1.0,6
519,1.0,6
520,1.0,9
521,1.0,6
522,1.0,6
523,1.0,6
524,1.0,6
525,1.0,6
526,1.0,6
527,1.0,9
528,1.0,7
529,0.0,4
530,1.0,6
531,1.0,8
532,1.0,11
533,1.0,6
534,1.0,6
535,1.0,6
536,1.0,6
537,1.0,6
538,1.0,8
539,1.0,6
540,1.0,6
541,1.0,8
542,1.0,7
543,1.0,6
544,1.0,8
545,1.0,6
546,0.0,5
547,1.0,9
548,1.0,8
549,1.0,8
550,1.0,6
551,1.0,8
552,1.0,8
553,1.0,6
554,0.0,5
555,0.0,3
556,0.0,2
557,1.0,8
558,1.0,6
559,1.0,6
560,1.0,6
561,1.0,6
562,1.0,6
563,1.0,6
564,1.0,6
565,1.0,6
566,1.0,6
567,1.0,6
568,1.0,6
569,1.0,6
570,1.0,6
571,1.0,6
572,0.0,2
573,1.0,6
574,0.0,4
575,1.0,6
576,1.0,6
577,1.0,6
578,1.0,6
579,1.0,6
580,1.0,8
581,0.0,5
582,1.0,6
583,1.0,6
584,1.0,6
585,1.0,6
586,1.0,6
587,1.0,6
588,0.0,3
589,1.0,6
590,1.0,6
591,1.0,6
592,0.0,2
593,1.0,6
594,0.0,4
595,1.0,6
596,1.0,6
597,1.0,6
598,1.0,6
599,1.0,8
600,1.0,6
601,1.0,7
602,1.0,6
603,1.0,7
604,1.0,6
605,0.0,2
606,1.0,6
607,1.0,6
608,0.0,5
609,0.0,3
610,0.0,3
611,1.0,6
612,0.0,5
613,1.0,8
614,1.0,8
615,1.0,6
616,1.0,6
617,1.0,7
618,1.0,6
619,1.0,6
620,1.0,6
621,1.0,6
622,1.0,6
623,1.0,8
624,0.0,2
625,1.0,6
626,1.0,6
627,1.0,6
628,1.0,6
629,1.0,6
630,1.0,6
631,1.0,6
632,1.0,8
633,1.0,6
634,1.0,8
635,1.0,6
636,1.0,6
637,1.0,8
638,1.0,8
639,0.0,5
640,0.0,4
641,0.0,4
642,1.0,6
643,1.0,6
644,1.0,6
645,1.0,6
646,1.0,8
647,1.0,6
648,0.0,4
649,1.0,6
650,1.0,8
651,1.0,6
652,1.0,6
653,1.0,6
654,1.0,6
655,1.0,6
656,1.0,6
657,1.0,6
658,1.0,8
659,1.0,8
660,1.0,6
661,1.0,8
662,1.0,9
663,1.0,6
664,1.0,6
665,1.0,6
666,1.0,6
667,1.0,10
668,1.0,6
669,1.0,6
670,1.0,6
671,1.0,11
672,1.0,10
673,1.0,8
674,1.0,6
675,1.0,6
676,1.0,6
677,0.0,5
678,1.0,6
679,0.0,2
680,1.0,9
681,1.0,6
682,1.0,8
683,1.0,7
684,1.0,6
685,1.0,6
686,1.0,7
687,0.0,3
688,1.0,7
689,0.0,2
690,1.0,6
691,1.0,6
692,1.0,8
693,1.0,8
694,1.0,6
695,1.0,6
696,0.0,2
697,1.0,8
698,1.0,6
699,1.0,8
700,1.0,6
701,1.0,6
702,1.0,9
703,1.0,6
704,1.0,8
705,1.0,11
706,1.0,6
707,1.0,6
708,1.0,6
709,1.0,6
710,1.0,8
711,1.0,6
712,1.0,6
713,1.0,6
714,0.0,5
715,1.0,6
716,1.0,6
717,1.0,6
718,1.0,6
719,1.0,6
720,1.0,7
721,1.0,6
722,1.0,6
723,1.0,6
724,1.0,6
725,1.0,10
726,1.0,6
727,1.0,6
728,1.0,6
729,1.0,6
730,1.0,6
731,1.0,7
732,1.0,6
733,1.0,8
734,1.0,7
735,1.0,6
736,1.0,6
737,1.0,14
738,1.0,6
739,1.0,6
740,1.0,12
741,1.0,6
742,1.0,6
743,1.0,6
744,1.0,6
745,1.0,6
746,1.0,6
747,0.0,3
748,1.0,6
749,1.0,6
750,1.0,6
751,1.0,7
752,1.0,6
753,1.0,6
754,1.0,6
755,1.0,8
756,0.0,2
757,1.0,6
758,1.0,6
759,1.0,6
760,1.0,6
761,1.0,6
762,1.0,6
763,1.0,6
764,1.0,6
765,1.0,6
766,0.0,4
767,1.0,8
768,1.0,6
769,0.0,2
770,1.0,10
771,1.0,8
772,1.0,6
773,1.0,6
774,1.0,6
775,0.0,3
776,1.0,6
777,1.0,6
778,0.0,6
779,1.0,8
780,1.0,6
781,1.0,9
782,1.0,6
783,1.0,6
784,1.0,8
785,1.0,8
786,1.0,6
787,0.0,5
788,1.0,6
789,1.0,6
790,1.0,6
791,1.0,6
792,1.0,6
793,1.0,6
794,1.0,8
795,1.0,6
796,0.0,2
797,1.0,8
798,1.0,7
799,1.0,6

View File

@@ -0,0 +1 @@
{"algo_name": "Q-learning", "env_name": "Racetrack-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/models/", "n_states": 4, "n_actions": 9}

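The epsilon_start, epsilon_end, and epsilon_decay fields in these QLearning configs suggest an exponential ε-greedy schedule. The sketch below shows what such a schedule might look like with the Racetrack-v0 values above; the exact formula is an assumption based on these three fields, not code taken from this diff.

import math

def epsilon_by_sample(sample_count, epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300):
    # Assumed exponential decay from epsilon_start toward epsilon_end;
    # epsilon_decay sets how quickly exploration is reduced.
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-sample_count / epsilon_decay)

print(round(epsilon_by_sample(0), 3))     # 0.95 at the start of training
print(round(epsilon_by_sample(300), 3))   # ~0.356 after epsilon_decay samples
print(round(epsilon_by_sample(3000), 3))  # ~0.01, essentially greedy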
Some files were not shown because too many files have changed in this diff.