Merge branch 'master' of github.com:datawhalechina/easy-rl
7
projects/.gitignore
vendored
@@ -2,4 +2,9 @@
|
||||
.ipynb_checkpoints
|
||||
__pycache__
|
||||
.vscode
|
||||
test.py
|
||||
test.py
|
||||
pseudocodes.aux
|
||||
pseudocodes.log
|
||||
pseudocodes.synctex.gz
|
||||
pseudocodes.out
|
||||
pseudocodes.toc
|
||||
@@ -1,318 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 定义模型\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"import paddle.nn as nn\n",
|
||||
"import paddle.nn.functional as F\n",
|
||||
"import parl\n",
|
||||
"\n",
|
||||
"class CartpoleModel(parl.Model):\n",
|
||||
" \"\"\" Linear network to solve Cartpole problem.\n",
|
||||
" Args:\n",
|
||||
" n_states (int): Dimension of observation space.\n",
|
||||
" n_actions (int): Dimension of action space.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, n_states, n_actions):\n",
|
||||
" super(CartpoleModel, self).__init__()\n",
|
||||
" hid1_size = 128\n",
|
||||
" hid2_size = 128\n",
|
||||
" self.fc1 = nn.Linear(n_states, hid1_size)\n",
|
||||
" self.fc2 = nn.Linear(hid1_size, hid2_size)\n",
|
||||
" self.fc3 = nn.Linear(hid2_size, n_actions)\n",
|
||||
"\n",
|
||||
" def forward(self, obs):\n",
|
||||
" h1 = F.relu(self.fc1(obs))\n",
|
||||
" h2 = F.relu(self.fc2(h1))\n",
|
||||
" Q = self.fc3(h2)\n",
|
||||
" return Q"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import parl\n",
|
||||
"import paddle\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CartpoleAgent(parl.Agent):\n",
|
||||
" \"\"\"Agent of Cartpole env.\n",
|
||||
" Args:\n",
|
||||
" algorithm(parl.Algorithm): algorithm used to solve the problem.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):\n",
|
||||
" super(CartpoleAgent, self).__init__(algorithm)\n",
|
||||
" assert isinstance(n_actions, int)\n",
|
||||
" self.n_actions = n_actions\n",
|
||||
"\n",
|
||||
" self.global_step = 0\n",
|
||||
" self.update_target_steps = 200\n",
|
||||
"\n",
|
||||
" self.e_greed = e_greed\n",
|
||||
" self.e_greed_decrement = e_greed_decrement\n",
|
||||
"\n",
|
||||
" def sample(self, obs):\n",
|
||||
" \"\"\"Sample an action `for exploration` when given an observation\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (n_states,)\n",
|
||||
" Returns:\n",
|
||||
" act(int): action\n",
|
||||
" \"\"\"\n",
|
||||
" sample = np.random.random()\n",
|
||||
" if sample < self.e_greed:\n",
|
||||
" act = np.random.randint(self.n_actions)\n",
|
||||
" else:\n",
|
||||
" if np.random.random() < 0.01:\n",
|
||||
" act = np.random.randint(self.n_actions)\n",
|
||||
" else:\n",
|
||||
" act = self.predict(obs)\n",
|
||||
" self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)\n",
|
||||
" return act\n",
|
||||
"\n",
|
||||
" def predict(self, obs):\n",
|
||||
" \"\"\"Predict an action when given an observation\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (n_states,)\n",
|
||||
" Returns:\n",
|
||||
" act(int): action\n",
|
||||
" \"\"\"\n",
|
||||
" obs = paddle.to_tensor(obs, dtype='float32')\n",
|
||||
" pred_q = self.alg.predict(obs)\n",
|
||||
" act = pred_q.argmax().numpy()[0]\n",
|
||||
" return act\n",
|
||||
"\n",
|
||||
" def learn(self, obs, act, reward, next_obs, terminal):\n",
|
||||
" \"\"\"Update model with an episode data\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (batch_size, n_states)\n",
|
||||
" act(np.int32): shape of (batch_size)\n",
|
||||
" reward(np.float32): shape of (batch_size)\n",
|
||||
" next_obs(np.float32): shape of (batch_size, n_states)\n",
|
||||
" terminal(np.float32): shape of (batch_size)\n",
|
||||
" Returns:\n",
|
||||
" loss(float)\n",
|
||||
" \"\"\"\n",
|
||||
" if self.global_step % self.update_target_steps == 0:\n",
|
||||
" self.alg.sync_target()\n",
|
||||
" self.global_step += 1\n",
|
||||
"\n",
|
||||
" act = np.expand_dims(act, axis=-1)\n",
|
||||
" reward = np.expand_dims(reward, axis=-1)\n",
|
||||
" terminal = np.expand_dims(terminal, axis=-1)\n",
|
||||
"\n",
|
||||
" obs = paddle.to_tensor(obs, dtype='float32')\n",
|
||||
" act = paddle.to_tensor(act, dtype='int32')\n",
|
||||
" reward = paddle.to_tensor(reward, dtype='float32')\n",
|
||||
" next_obs = paddle.to_tensor(next_obs, dtype='float32')\n",
|
||||
" terminal = paddle.to_tensor(terminal, dtype='float32')\n",
|
||||
" loss = self.alg.learn(obs, act, reward, next_obs, terminal)\n",
|
||||
" return loss.numpy()[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import gym\n",
|
||||
"import numpy as np\n",
|
||||
"import parl\n",
|
||||
"\n",
|
||||
"from parl.utils import logger, ReplayMemory\n",
|
||||
"from parl.algorithms import DQN\n",
|
||||
"\n",
|
||||
"LEARN_FREQ = 5 # training frequency\n",
|
||||
"MEMORY_SIZE = 200000\n",
|
||||
"MEMORY_WARMUP_SIZE = 200\n",
|
||||
"BATCH_SIZE = 64\n",
|
||||
"LEARNING_RATE = 0.0005\n",
|
||||
"GAMMA = 0.99\n",
|
||||
"\n",
|
||||
"# train an episode\n",
|
||||
"def run_train_episode(agent, env, rpm):\n",
|
||||
" total_reward = 0\n",
|
||||
" obs = env.reset()\n",
|
||||
" step = 0\n",
|
||||
" while True:\n",
|
||||
" step += 1\n",
|
||||
" action = agent.sample(obs)\n",
|
||||
" next_obs, reward, done, _ = env.step(action)\n",
|
||||
" rpm.append(obs, action, reward, next_obs, done)\n",
|
||||
"\n",
|
||||
" # train model\n",
|
||||
" if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):\n",
|
||||
" # s,a,r,s',done\n",
|
||||
" (batch_obs, batch_action, batch_reward, batch_next_obs,\n",
|
||||
" batch_done) = rpm.sample_batch(BATCH_SIZE)\n",
|
||||
" train_loss = agent.learn(batch_obs, batch_action, batch_reward,\n",
|
||||
" batch_next_obs, batch_done)\n",
|
||||
"\n",
|
||||
" total_reward += reward\n",
|
||||
" obs = next_obs\n",
|
||||
" if done:\n",
|
||||
" break\n",
|
||||
" return total_reward\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# evaluate 5 episodes\n",
|
||||
"def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):\n",
|
||||
" eval_reward = []\n",
|
||||
" for i in range(eval_episodes):\n",
|
||||
" obs = env.reset()\n",
|
||||
" episode_reward = 0\n",
|
||||
" while True:\n",
|
||||
" action = agent.predict(obs)\n",
|
||||
" obs, reward, done, _ = env.step(action)\n",
|
||||
" episode_reward += reward\n",
|
||||
" if render:\n",
|
||||
" env.render()\n",
|
||||
" if done:\n",
|
||||
" break\n",
|
||||
" eval_reward.append(episode_reward)\n",
|
||||
" return np.mean(eval_reward)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main(args):\n",
|
||||
" env = gym.make('CartPole-v0')\n",
|
||||
" n_states = env.observation_space.shape[0]\n",
|
||||
" n_actions = env.action_space.n\n",
|
||||
" logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))\n",
|
||||
"\n",
|
||||
" # set action_shape = 0 while in discrete control environment\n",
|
||||
" rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)\n",
|
||||
"\n",
|
||||
" # build an agent\n",
|
||||
" model = CartpoleModel(n_states=n_states, n_actions=n_actions)\n",
|
||||
" alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)\n",
|
||||
" agent = CartpoleAgent(\n",
|
||||
" alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)\n",
|
||||
"\n",
|
||||
" # warmup memory\n",
|
||||
" while len(rpm) < MEMORY_WARMUP_SIZE:\n",
|
||||
" run_train_episode(agent, env, rpm)\n",
|
||||
"\n",
|
||||
" max_episode = args.max_episode\n",
|
||||
"\n",
|
||||
" # start training\n",
|
||||
" episode = 0\n",
|
||||
" while episode < max_episode:\n",
|
||||
" # train part\n",
|
||||
" for i in range(50):\n",
|
||||
" total_reward = run_train_episode(agent, env, rpm)\n",
|
||||
" episode += 1\n",
|
||||
"\n",
|
||||
" # test part\n",
|
||||
" eval_reward = run_evaluate_episodes(agent, env, render=False)\n",
|
||||
" logger.info('episode:{} e_greed:{} Test reward:{}'.format(\n",
|
||||
" episode, agent.e_greed, eval_reward))\n",
|
||||
"\n",
|
||||
" # save the parameters to ./model.ckpt\n",
|
||||
" save_path = './model.ckpt'\n",
|
||||
" agent.save(save_path)\n",
|
||||
"\n",
|
||||
" # save the model and parameters of policy network for inference\n",
|
||||
" save_inference_path = './inference_model'\n",
|
||||
" input_shapes = [[None, env.observation_space.shape[0]]]\n",
|
||||
" input_dtypes = ['float32']\n",
|
||||
" agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:64]\u001b[0m obs_dim 4, act_dim 2\n",
|
||||
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:92]\u001b[0m episode:50 e_greed:0.0988929999999989 Test reward:18.4\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:100 e_greed:0.09794799999999795 Test reward:9.6\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:150 e_greed:0.0973899999999974 Test reward:37.8\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:200 e_greed:0.09684299999999685 Test reward:8.8\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:250 e_greed:0.09635499999999636 Test reward:9.4\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:300 e_greed:0.09585299999999586 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:350 e_greed:0.09535799999999536 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:400 e_greed:0.09486399999999487 Test reward:10.0\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:450 e_greed:0.09435299999999436 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:500 e_greed:0.09384899999999385 Test reward:9.4\n",
|
||||
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:550 e_greed:0.09302299999999303 Test reward:69.0\n",
|
||||
"\u001b[32m[08-01 21:48:25 MainThread @3996942455.py:92]\u001b[0m episode:600 e_greed:0.08774199999998775 Test reward:141.2\n",
|
||||
"\u001b[32m[08-01 21:48:30 MainThread @3996942455.py:92]\u001b[0m episode:650 e_greed:0.0791019999999791 Test reward:184.0\n",
|
||||
"\u001b[32m[08-01 21:48:35 MainThread @3996942455.py:92]\u001b[0m episode:700 e_greed:0.07011299999997012 Test reward:182.0\n",
|
||||
"\u001b[32m[08-01 21:48:40 MainThread @3996942455.py:92]\u001b[0m episode:750 e_greed:0.06089099999996089 Test reward:197.4\n",
|
||||
"\u001b[32m[08-01 21:48:45 MainThread @3996942455.py:92]\u001b[0m episode:800 e_greed:0.05139199999995139 Test reward:183.4\n",
|
||||
"\u001b[32m[08-01 21:48:50 MainThread @3996942455.py:92]\u001b[0m episode:850 e_greed:0.042255999999942256 Test reward:153.0\n",
|
||||
"\u001b[32m[08-01 21:48:55 MainThread @3996942455.py:92]\u001b[0m episode:900 e_greed:0.033495999999933496 Test reward:192.6\n",
|
||||
"\u001b[32m[08-01 21:49:00 MainThread @3996942455.py:92]\u001b[0m episode:950 e_greed:0.024318999999924318 Test reward:166.6\n",
|
||||
"\u001b[32m[08-01 21:49:06 MainThread @3996942455.py:92]\u001b[0m episode:1000 e_greed:0.014873999999916176 Test reward:187.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\n",
|
||||
" '--max_episode',\n",
|
||||
" type=int,\n",
|
||||
" default=1000,\n",
|
||||
" help='stop condition: number of max episode')\n",
|
||||
"args = parser.parse_args(args=[])\n",
|
||||
"\n",
|
||||
"main(args)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.7.12 ('rl_tutorials')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "4f613f1ab80ec98dc1b91d6e720de51301598a187317378e53e49b773c1123dd"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
[PARL](https://github.com/PaddlePaddle/PARL)是一个高性能、灵活的强化学习框架,由百度AI Studio开发。
|
||||
|
||||
## 安装
|
||||
|
||||
1. 安装parl,参考[PARL Github](https://github.com/PaddlePaddle/PARL)
|
||||
2. 安装paddlepaddle:```pip install paddlepaddle```
|
||||
|
||||
## 常见问题
|
||||
|
||||
```jupyter-client 7.3.1 requires pyzmq>=22.3, but you have pyzmq 18.1.1 which is incompatible.```:
|
||||
```pip install -U pyzmq```
|
||||
@@ -11,7 +11,6 @@
|
||||
项目内容主要包含以下几个部分:
|
||||
* [Jupyter Notebook](./notebooks/):使用Notebook写的算法,有比较详细的实战引导,推荐新手食用
|
||||
* [codes](./codes/):这些是基于Python脚本写的算法,风格比较接近实际项目的写法,推荐有一定代码基础的人阅读,下面会说明其具体的一些架构
|
||||
* [parl](./PARL/):应业务需求,写了一些基于百度飞浆平台和```parl```模块的RL实例
|
||||
* [附件](./assets/):目前包含强化学习各算法的中文伪代码
|
||||
|
||||
|
||||
@@ -23,15 +22,15 @@
|
||||
|
||||
注:点击对应的名称会跳到[codes](./codes/)下对应的算法中,其他版本还请读者自行翻阅
|
||||
|
||||
| 算法名称 | 参考文献 | 备注 |
|
||||
| :-----------------------: | :----------------------------------------------------------: | :--: |
|
||||
| | | |
|
||||
| DQN-CNN | | 待更 |
|
||||
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
|
||||
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
|
||||
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
|
||||
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
|
||||
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | 待更 |
|
||||
| 算法名称 | 参考文献 | 备注 |
|
||||
| :-------------------------------------: | :----------------------------------------------------------: | :--: |
|
||||
| [Policy Gradient](codes/PolicyGradient) | [Policy Gradient paper](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | |
|
||||
| DQN-CNN | | 待更 |
|
||||
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
|
||||
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
|
||||
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
|
||||
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
|
||||
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | 待更 |
|
||||
|
||||
## 3、算法环境
|
||||
|
||||
|
||||
@@ -1,35 +0,0 @@
|
||||
\relax
|
||||
\providecommand\hyper@newdestlabel[2]{}
|
||||
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
|
||||
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
|
||||
\global\let\oldcontentsline\contentsline
|
||||
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
|
||||
\global\let\oldnewlabel\newlabel
|
||||
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
|
||||
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
|
||||
\AtEndDocument{\ifx\hyper@anchor\@undefined
|
||||
\let\contentsline\oldcontentsline
|
||||
\let\newlabel\oldnewlabel
|
||||
\fi}
|
||||
\fi}
|
||||
\global\let\hyper@last\relax
|
||||
\gdef\HyperFirstAtBeginDocument#1{#1}
|
||||
\providecommand*\HyPL@Entry[1]{}
|
||||
\HyPL@Entry{0<</S/D>>}
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {1}模版备用}{2}{section.1}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{2}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{3}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{4}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{5}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{6}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{7}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{8}{algorithm.}\protected@file@percent }
|
||||
\@writefile{toc}{\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{9}{algorithm.}\protected@file@percent }
|
||||
\gdef \@abspage@last{9}
|
||||
@@ -1,570 +0,0 @@
|
||||
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 23 AUG 2022 19:26
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
file:line:error style messages enabled.
|
||||
%&-line parsing enabled.
|
||||
**/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes
|
||||
(/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes.tex
|
||||
LaTeX2e <2020-10-01> patch level 4
|
||||
L3 programming layer <2021-02-18> (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexart.cls (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexbackend.cfg
|
||||
File: ctexbackend.cfg 2021/03/14 v2.5.6 Backend configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3kernel/expl3.sty
|
||||
Package: expl3 2021-02-18 L3 programming layer (loader)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
|
||||
File: l3backend-xetex.def 2021-03-18 L3 backend support: XeTeX
|
||||
(|extractbb --version)
|
||||
\c__kernel_sys_dvipdfmx_version_int=\count175
|
||||
\l__color_backend_stack_int=\count176
|
||||
\g__color_backend_stack_int=\count177
|
||||
\g__graphics_track_int=\count178
|
||||
\l__pdf_internal_box=\box47
|
||||
\g__pdf_backend_object_int=\count179
|
||||
\g__pdf_backend_annotation_int=\count180
|
||||
\g__pdf_backend_link_int=\count181
|
||||
))
|
||||
Document Class: ctexart 2021/03/14 v2.5.6 Chinese adapter for class article (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-2020-10-01.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-generic.tex))) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty
|
||||
Package: l3keys2e 2021-03-12 LaTeX2e option processing using LaTeX3 keys
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexhook.sty
|
||||
Package: ctexhook 2021/03/14 v2.5.6 Document and package hooks (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexpatch.sty
|
||||
Package: ctexpatch 2021/03/14 v2.5.6 Patching commands (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/fix-cm.sty
|
||||
Package: fix-cm 2015/01/14 v1.1t fixes to LaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/ts1enc.def
|
||||
File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
|
||||
LaTeX Font Info: Redeclaring font encoding TS1 on input line 47.
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel.sty
|
||||
Package: everysel 2021/01/20 v2.1 EverySelectfont Package (MS)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel-2011-10-28.sty))
|
||||
\l__ctex_tmp_int=\count182
|
||||
\l__ctex_tmp_box=\box48
|
||||
\l__ctex_tmp_dim=\dimen138
|
||||
\g__ctex_section_depth_int=\count183
|
||||
\g__ctex_font_size_int=\count184
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexopts.cfg
|
||||
File: ctexopts.cfg 2021/03/14 v2.5.6 Option configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/article.cls
|
||||
Document Class: article 2020/04/10 v1.4m Standard LaTeX document class
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/size11.clo
|
||||
File: size11.clo 2020/04/10 v1.4m Standard LaTeX file (size option)
|
||||
)
|
||||
\c@part=\count185
|
||||
\c@section=\count186
|
||||
\c@subsection=\count187
|
||||
\c@subsubsection=\count188
|
||||
\c@paragraph=\count189
|
||||
\c@subparagraph=\count190
|
||||
\c@figure=\count191
|
||||
\c@table=\count192
|
||||
\abovecaptionskip=\skip47
|
||||
\belowcaptionskip=\skip48
|
||||
\bibindent=\dimen139
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/engine/ctex-engine-xetex.def
|
||||
File: ctex-engine-xetex.def 2021/03/14 v2.5.6 XeLaTeX adapter (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.sty
|
||||
Package: xeCJK 2020/10/19 v3.8.6 Typesetting CJK scripts with XeLaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xtemplate/xtemplate.sty
|
||||
Package: xtemplate 2021-03-12 L3 Experimental prototype document functions
|
||||
\l__xtemplate_tmp_dim=\dimen140
|
||||
\l__xtemplate_tmp_int=\count193
|
||||
\l__xtemplate_tmp_muskip=\muskip16
|
||||
\l__xtemplate_tmp_skip=\skip49
|
||||
)
|
||||
\l__xeCJK_tmp_int=\count194
|
||||
\l__xeCJK_tmp_box=\box49
|
||||
\l__xeCJK_tmp_dim=\dimen141
|
||||
\l__xeCJK_tmp_skip=\skip50
|
||||
\g__xeCJK_space_factor_int=\count195
|
||||
\l__xeCJK_begin_int=\count196
|
||||
\l__xeCJK_end_int=\count197
|
||||
\c__xeCJK_CJK_class_int=\XeTeXcharclass1
|
||||
\c__xeCJK_FullLeft_class_int=\XeTeXcharclass2
|
||||
\c__xeCJK_FullRight_class_int=\XeTeXcharclass3
|
||||
\c__xeCJK_HalfLeft_class_int=\XeTeXcharclass4
|
||||
\c__xeCJK_HalfRight_class_int=\XeTeXcharclass5
|
||||
\c__xeCJK_NormalSpace_class_int=\XeTeXcharclass6
|
||||
\c__xeCJK_CM_class_int=\XeTeXcharclass7
|
||||
\c__xeCJK_HangulJamo_class_int=\XeTeXcharclass8
|
||||
\l__xeCJK_last_skip=\skip51
|
||||
\g__xeCJK_node_int=\count198
|
||||
\c__xeCJK_CJK_node_dim=\dimen142
|
||||
\c__xeCJK_CJK-space_node_dim=\dimen143
|
||||
\c__xeCJK_default_node_dim=\dimen144
|
||||
\c__xeCJK_default-space_node_dim=\dimen145
|
||||
\c__xeCJK_CJK-widow_node_dim=\dimen146
|
||||
\c__xeCJK_normalspace_node_dim=\dimen147
|
||||
\l__xeCJK_ccglue_skip=\skip52
|
||||
\l__xeCJK_ecglue_skip=\skip53
|
||||
\l__xeCJK_punct_kern_skip=\skip54
|
||||
\l__xeCJK_last_penalty_int=\count199
|
||||
\l__xeCJK_last_bound_dim=\dimen148
|
||||
\l__xeCJK_last_kern_dim=\dimen149
|
||||
\l__xeCJK_widow_penalty_int=\count266
|
||||
|
||||
Package xtemplate Info: Declaring object type 'xeCJK/punctuation' taking 0
|
||||
(xtemplate) argument(s) on line 2341.
|
||||
|
||||
\l__xeCJK_fixed_punct_width_dim=\dimen150
|
||||
\l__xeCJK_mixed_punct_width_dim=\dimen151
|
||||
\l__xeCJK_middle_punct_width_dim=\dimen152
|
||||
\l__xeCJK_fixed_margin_width_dim=\dimen153
|
||||
\l__xeCJK_mixed_margin_width_dim=\dimen154
|
||||
\l__xeCJK_middle_margin_width_dim=\dimen155
|
||||
\l__xeCJK_bound_punct_width_dim=\dimen156
|
||||
\l__xeCJK_bound_margin_width_dim=\dimen157
|
||||
\l__xeCJK_margin_minimum_dim=\dimen158
|
||||
\l__xeCJK_kerning_total_width_dim=\dimen159
|
||||
\l__xeCJK_same_align_margin_dim=\dimen160
|
||||
\l__xeCJK_different_align_margin_dim=\dimen161
|
||||
\l__xeCJK_kerning_margin_width_dim=\dimen162
|
||||
\l__xeCJK_kerning_margin_minimum_dim=\dimen163
|
||||
\l__xeCJK_bound_dim=\dimen164
|
||||
\l__xeCJK_reverse_bound_dim=\dimen165
|
||||
\l__xeCJK_margin_dim=\dimen166
|
||||
\l__xeCJK_minimum_bound_dim=\dimen167
|
||||
\l__xeCJK_kerning_margin_dim=\dimen168
|
||||
\g__xeCJK_family_int=\count267
|
||||
\l__xeCJK_fam_int=\count268
|
||||
\g__xeCJK_fam_allocation_int=\count269
|
||||
\l__xeCJK_verb_case_int=\count270
|
||||
\l__xeCJK_verb_exspace_skip=\skip55
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.sty
|
||||
Package: fontspec 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
|
||||
Package: fontspec-xetex 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
|
||||
\l__fontspec_script_int=\count271
|
||||
\l__fontspec_language_int=\count272
|
||||
\l__fontspec_strnum_int=\count273
|
||||
\l__fontspec_tmp_int=\count274
|
||||
\l__fontspec_tmpa_int=\count275
|
||||
\l__fontspec_tmpb_int=\count276
|
||||
\l__fontspec_tmpc_int=\count277
|
||||
\l__fontspec_em_int=\count278
|
||||
\l__fontspec_emdef_int=\count279
|
||||
\l__fontspec_strong_int=\count280
|
||||
\l__fontspec_strongdef_int=\count281
|
||||
\l__fontspec_tmpa_dim=\dimen169
|
||||
\l__fontspec_tmpb_dim=\dimen170
|
||||
\l__fontspec_tmpc_dim=\dimen171
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/fontenc.sty
|
||||
Package: fontenc 2020/08/10 v2.0s Standard LaTeX package
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.cfg
|
||||
File: xeCJK.cfg 2020/10/19 v3.8.6 Configuration file for xeCJK package
|
||||
))
|
||||
\ccwd=\dimen172
|
||||
\l__ctex_ccglue_skip=\skip56
|
||||
)
|
||||
\l__ctex_ziju_dim=\dimen173
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber.sty
|
||||
Package: zhnumber 2020/05/01 v2.8 Typesetting numbers with Chinese glyphs
|
||||
\l__zhnum_scale_int=\count282
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber-utf8.cfg
|
||||
File: zhnumber-utf8.cfg 2020/05/01 v2.8 Chinese numerals with UTF8 encoding
|
||||
))
|
||||
\l__ctex_heading_skip=\skip57
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/scheme/ctex-scheme-chinese-article.def
|
||||
File: ctex-scheme-chinese-article.def 2021/03/14 v2.5.6 Chinese scheme for article (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex-name-utf8.cfg
|
||||
File: ctex-name-utf8.cfg 2021/03/14 v2.5.6 Caption with encoding UTF-8 (CTEX)
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-mac.def
|
||||
File: ctex-fontset-mac.def 2021/03/14 v2.5.6 macOS fonts definition (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-macnew.def
|
||||
File: ctex-fontset-macnew.def 2021/03/14 v2.5.6 macOS fonts definition for El Capitan or later version (CTEX)
|
||||
|
||||
|
||||
Package fontspec Warning: Font "Songti SC Light" does not contain requested
|
||||
(fontspec) Script "CJK".
|
||||
|
||||
|
||||
Package fontspec Info: Font family 'SongtiSCLight(0)' created for font 'Songti
|
||||
(fontspec) SC Light' with options
|
||||
(fontspec) [Script={CJK},BoldItalicFont={Kaiti SC
|
||||
(fontspec) Bold},BoldFont={Songti SC Bold},ItalicFont={Kaiti SC}].
|
||||
(fontspec)
|
||||
(fontspec) This font family consists of the following NFSS
|
||||
(fontspec) series/shapes:
|
||||
(fontspec)
|
||||
(fontspec) - 'normal' (m/n) with NFSS spec.: <->"Songti SC
|
||||
(fontspec) Light/OT:language=dflt;"
|
||||
(fontspec) - 'small caps' (m/sc) with NFSS spec.:
|
||||
(fontspec) - 'bold' (b/n) with NFSS spec.: <->"Songti SC
|
||||
(fontspec) Bold/OT:language=dflt;"
|
||||
(fontspec) - 'bold small caps' (b/sc) with NFSS spec.:
|
||||
(fontspec) - 'italic' (m/it) with NFSS spec.: <->"Kaiti
|
||||
(fontspec) SC/OT:language=dflt;"
|
||||
(fontspec) - 'italic small caps' (m/scit) with NFSS spec.:
|
||||
(fontspec) - 'bold italic' (b/it) with NFSS spec.: <->"Kaiti SC
|
||||
(fontspec) Bold/OT:language=dflt;"
|
||||
(fontspec) - 'bold italic small caps' (b/scit) with NFSS spec.:
|
||||
|
||||
))) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex.cfg
|
||||
File: ctex.cfg 2021/03/14 v2.5.6 Configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithm.sty
|
||||
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
|
||||
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/float/float.sty
|
||||
Package: float 2001/11/08 v1.3d Float enhancements (AL)
|
||||
\c@float@type=\count283
|
||||
\float@exts=\toks15
|
||||
\float@box=\box50
|
||||
\@float@everytoks=\toks16
|
||||
\@floatcapt=\box51
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/ifthen.sty
|
||||
Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
|
||||
)
|
||||
\@float@every@algorithm=\toks17
|
||||
\c@algorithm=\count284
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithmic.sty
|
||||
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
|
||||
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/graphics/keyval.sty
|
||||
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
|
||||
\KV@toks@=\toks18
|
||||
)
|
||||
\c@ALC@unique=\count285
|
||||
\c@ALC@line=\count286
|
||||
\c@ALC@rem=\count287
|
||||
\c@ALC@depth=\count288
|
||||
\ALC@tlm=\skip58
|
||||
\algorithmicindent=\skip59
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amssymb.sty
|
||||
Package: amssymb 2013/01/14 v3.01 AMS font symbols
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amsfonts.sty
|
||||
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
|
||||
\@emptytoks=\toks19
|
||||
\symAMSa=\mathgroup4
|
||||
\symAMSb=\mathgroup5
|
||||
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
|
||||
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsmath.sty
|
||||
Package: amsmath 2020/09/23 v2.17i AMS math features
|
||||
\@mathmargin=\skip60
|
||||
|
||||
For additional information on amsmath, use the `?' option.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amstext.sty
|
||||
Package: amstext 2000/06/29 v2.01 AMS text
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsgen.sty
|
||||
File: amsgen.sty 1999/11/30 v2.0 generic functions
|
||||
\@emptytoks=\toks20
|
||||
\ex@=\dimen174
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsbsy.sty
|
||||
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
|
||||
\pmbraise@=\dimen175
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsopn.sty
|
||||
Package: amsopn 2016/03/08 v2.02 operator names
|
||||
)
|
||||
\inf@bad=\count289
|
||||
LaTeX Info: Redefining \frac on input line 234.
|
||||
\uproot@=\count290
|
||||
\leftroot@=\count291
|
||||
LaTeX Info: Redefining \overline on input line 399.
|
||||
\classnum@=\count292
|
||||
\DOTSCASE@=\count293
|
||||
LaTeX Info: Redefining \ldots on input line 496.
|
||||
LaTeX Info: Redefining \dots on input line 499.
|
||||
LaTeX Info: Redefining \cdots on input line 620.
|
||||
\Mathstrutbox@=\box52
|
||||
\strutbox@=\box53
|
||||
\big@size=\dimen176
|
||||
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
|
||||
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
|
||||
\macc@depth=\count294
|
||||
\c@MaxMatrixCols=\count295
|
||||
\dotsspace@=\muskip17
|
||||
\c@parentequation=\count296
|
||||
\dspbrk@lvl=\count297
|
||||
\tag@help=\toks21
|
||||
\row@=\count298
|
||||
\column@=\count299
|
||||
\maxfields@=\count300
|
||||
\andhelp@=\toks22
|
||||
\eqnshift@=\dimen177
|
||||
\alignsep@=\dimen178
|
||||
\tagshift@=\dimen179
|
||||
\tagwidth@=\dimen180
|
||||
\totwidth@=\dimen181
|
||||
\lineht@=\dimen182
|
||||
\@envbody=\toks23
|
||||
\multlinegap=\skip61
|
||||
\multlinetaggap=\skip62
|
||||
\mathdisplay@stack=\toks24
|
||||
LaTeX Info: Redefining \[ on input line 2923.
|
||||
LaTeX Info: Redefining \] on input line 2924.
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref.sty
|
||||
Package: hyperref 2021-02-27 v7.00k Hypertext links for LaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
|
||||
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/iftex/iftex.sty
|
||||
Package: iftex 2020/03/06 v1.0d TeX engine tests
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
|
||||
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/generic/infwarerr/infwarerr.sty
|
||||
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
|
||||
)
|
||||
Package pdftexcmds Info: \pdf@primitive is available.
|
||||
Package pdftexcmds Info: \pdf@ifprimitive is available.
|
||||
Package pdftexcmds Info: \pdfdraftmode not found.
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvsetkeys/kvsetkeys.sty
|
||||
Package: kvsetkeys 2019/12/15 v1.18 Key value parser (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
|
||||
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdfescape/pdfescape.sty
|
||||
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hycolor/hycolor.sty
|
||||
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
|
||||
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/auxhook/auxhook.sty
|
||||
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/kvoptions/kvoptions.sty
|
||||
Package: kvoptions 2020-10-07 v3.14 Key value format for package options (HO)
|
||||
)
|
||||
\@linkdim=\dimen183
|
||||
\Hy@linkcounter=\count301
|
||||
\Hy@pagecounter=\count302
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/pd1enc.def
|
||||
File: pd1enc.def 2021-02-27 v7.00k Hyperref: PDFDocEncoding definition (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref-langpatches.def
|
||||
File: hyperref-langpatches.def 2021-02-27 v7.00k Hyperref: patches for babel languages
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/intcalc/intcalc.sty
|
||||
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/etexcmds/etexcmds.sty
|
||||
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
|
||||
)
|
||||
\Hy@SavedSpaceFactor=\count303
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/puenc.def
|
||||
File: puenc.def 2021-02-27 v7.00k Hyperref: PDF Unicode definition (HO)
|
||||
)
|
||||
Package hyperref Info: Option `unicode' set `true' on input line 4073.
|
||||
Package hyperref Info: Hyper figures OFF on input line 4192.
|
||||
Package hyperref Info: Link nesting OFF on input line 4197.
|
||||
Package hyperref Info: Hyper index ON on input line 4200.
|
||||
Package hyperref Info: Plain pages OFF on input line 4207.
|
||||
Package hyperref Info: Backreferencing OFF on input line 4212.
|
||||
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
|
||||
Package hyperref Info: Bookmarks ON on input line 4445.
|
||||
\c@Hy@tempcnt=\count304
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/url/url.sty
|
||||
\Urlmuskip=\muskip18
|
||||
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
|
||||
)
|
||||
LaTeX Info: Redefining \url on input line 4804.
|
||||
\XeTeXLinkMargin=\dimen184
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/generic/bitset/bitset.sty
|
||||
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
|
||||
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO)
|
||||
))
|
||||
\Fld@menulength=\count305
|
||||
\Field@Width=\dimen185
|
||||
\Fld@charsize=\dimen186
|
||||
Package hyperref Info: Hyper figures OFF on input line 6075.
|
||||
Package hyperref Info: Link nesting OFF on input line 6080.
|
||||
Package hyperref Info: Hyper index ON on input line 6083.
|
||||
Package hyperref Info: backreferencing OFF on input line 6090.
|
||||
Package hyperref Info: Link coloring OFF on input line 6095.
|
||||
Package hyperref Info: Link coloring with OCG OFF on input line 6100.
|
||||
Package hyperref Info: PDF/A mode OFF on input line 6105.
|
||||
LaTeX Info: Redefining \ref on input line 6145.
|
||||
LaTeX Info: Redefining \pageref on input line 6149.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atbegshi-ltx.sty
|
||||
Package: atbegshi-ltx 2020/08/17 v1.0a Emulation of the original atbegshi package
|
||||
with kernel methods
|
||||
)
|
||||
\Hy@abspage=\count306
|
||||
\c@Item=\count307
|
||||
\c@Hfootnote=\count308
|
||||
)
|
||||
Package hyperref Info: Driver (autodetected): hxetex.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hxetex.def
|
||||
File: hxetex.def 2021-02-27 v7.00k Hyperref driver for XeTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/generic/stringenc/stringenc.sty
|
||||
Package: stringenc 2019/11/29 v1.12 Convert strings between diff. encodings (HO)
|
||||
)
|
||||
\pdfm@box=\box54
|
||||
\c@Hy@AnnotLevel=\count309
|
||||
\HyField@AnnotCount=\count310
|
||||
\Fld@listcount=\count311
|
||||
\c@bookmark@seq@number=\count312
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
|
||||
Package: rerunfilecheck 2019/12/05 v1.9 Rerun checks for auxiliary files (HO)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atveryend-ltx.sty
|
||||
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atvery package
|
||||
with kernel methods
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
|
||||
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
|
||||
)
|
||||
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 286.
|
||||
)
|
||||
\Hy@SectionHShift=\skip63
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/setspace/setspace.sty
|
||||
Package: setspace 2011/12/19 v6.7a set line spacing
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/titlesec/titlesec.sty
|
||||
Package: titlesec 2019/10/16 v2.13 Sectioning titles
|
||||
\ttl@box=\box55
|
||||
\beforetitleunit=\skip64
|
||||
\aftertitleunit=\skip65
|
||||
\ttl@plus=\dimen187
|
||||
\ttl@minus=\dimen188
|
||||
\ttl@toksa=\toks25
|
||||
\titlewidth=\dimen189
|
||||
\titlewidthlast=\dimen190
|
||||
\titlewidthfirst=\dimen191
|
||||
) (./pseudocodes.aux)
|
||||
\openout1 = `pseudocodes.aux'.
|
||||
|
||||
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 14.
|
||||
LaTeX Font Info: ... okay on input line 14.
|
||||
ABD: EverySelectfont initializing macros
|
||||
LaTeX Info: Redefining \selectfont on input line 14.
|
||||
|
||||
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
|
||||
(fontspec) this).
|
||||
|
||||
\symlegacymaths=\mathgroup6
|
||||
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
|
||||
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \acute on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \grave on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \ddot on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \tilde on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \bar on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \breve on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \check on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \hat on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \dot on input line 14.
|
||||
LaTeX Font Info: Redeclaring math accent \mathring on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Delta on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Theta on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Xi on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Pi on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Phi on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Psi on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \Omega on input line 14.
|
||||
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 14.
|
||||
LaTeX Font Info: Redeclaring symbol font `operators' on input line 14.
|
||||
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
|
||||
(Font) `operators' in the math version `normal' on input line 14.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
|
||||
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 14.
|
||||
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
|
||||
(Font) `operators' in the math version `bold' on input line 14.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
|
||||
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 14.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
|
||||
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
|
||||
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
|
||||
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
|
||||
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
|
||||
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 14.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
|
||||
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
|
||||
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
|
||||
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 14.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
|
||||
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 14.
|
||||
Package hyperref Info: Link coloring OFF on input line 14.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/nameref.sty
|
||||
Package: nameref 2021-04-02 v2.47 Cross-referencing by name of section
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/refcount/refcount.sty
|
||||
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty
|
||||
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
|
||||
)
|
||||
\c@section@level=\count313
|
||||
)
|
||||
LaTeX Info: Redefining \ref on input line 14.
|
||||
LaTeX Info: Redefining \pageref on input line 14.
|
||||
LaTeX Info: Redefining \nameref on input line 14.
|
||||
(./pseudocodes.out) (./pseudocodes.out)
|
||||
\@outlinefile=\write3
|
||||
\openout3 = `pseudocodes.out'.
|
||||
|
||||
(./pseudocodes.toc)
|
||||
\tf@toc=\write4
|
||||
\openout4 = `pseudocodes.toc'.
|
||||
|
||||
LaTeX Font Info: Font shape `TU/SongtiSCLight(0)/m/sl' in size <10.95> not available
|
||||
(Font) Font shape `TU/SongtiSCLight(0)/m/it' tried instead on input line 17.
|
||||
[1
|
||||
|
||||
]
|
||||
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 22.
|
||||
[2
|
||||
|
||||
]
|
||||
LaTeX Font Info: Trying to load font information for U+msa on input line 32.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
|
||||
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
|
||||
)
|
||||
LaTeX Font Info: Trying to load font information for U+msb on input line 32.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
|
||||
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
|
||||
) [3
|
||||
|
||||
] [4
|
||||
|
||||
] [5
|
||||
|
||||
] [6
|
||||
|
||||
] [7
|
||||
|
||||
] [8
|
||||
|
||||
]
|
||||
Overfull \hbox (32.54117pt too wide) in paragraph at lines 212--212
|
||||
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^R\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 Q[] [] []$|
|
||||
[]
|
||||
|
||||
|
||||
Overfull \hbox (15.41673pt too wide) in paragraph at lines 213--213
|
||||
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^^\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 ^^K [] [] \OT1/cmr/m/n/9 + [] \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 f[] []$\TU/lmr/m/n/9 ,$[][] \OT1/cmr/m/n/9 =
|
||||
[]
|
||||
|
||||
[9
|
||||
|
||||
] (./pseudocodes.aux)
|
||||
Package rerunfilecheck Info: File `pseudocodes.out' has not changed.
|
||||
(rerunfilecheck) Checksum: 35B5A79A86EF3BC70F1A0B3BCBEBAA13;724.
|
||||
)
|
||||
Here is how much of TeX's memory you used:
|
||||
14827 strings out of 476919
|
||||
313456 string characters out of 5821840
|
||||
653576 words of memory out of 5000000
|
||||
34576 multiletter control sequences out of 15000+600000
|
||||
413609 words of font info for 91 fonts, out of 8000000 for 9000
|
||||
1348 hyphenation exceptions out of 8191
|
||||
101i,13n,104p,676b,697s stack positions out of 5000i,500n,10000p,200000b,80000s
|
||||
|
||||
Output written on pseudocodes.pdf (9 pages).
|
||||
@@ -1,8 +0,0 @@
|
||||
\BOOKMARK [1][-]{section.1}{\376\377\152\041\162\110\131\007\165\050}{}% 1
|
||||
\BOOKMARK [1][-]{section.2}{\376\377\000Q\000\040\000l\000e\000a\000r\000n\000i\000n\000g\173\227\154\325}{}% 2
|
||||
\BOOKMARK [1][-]{section.3}{\376\377\000S\000a\000r\000s\000a\173\227\154\325}{}% 3
|
||||
\BOOKMARK [1][-]{section.4}{\376\377\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\173\227\154\325}{}% 4
|
||||
\BOOKMARK [1][-]{section.5}{\376\377\000D\000Q\000N\173\227\154\325}{}% 5
|
||||
\BOOKMARK [1][-]{section.6}{\376\377\000S\000o\000f\000t\000Q\173\227\154\325}{}% 6
|
||||
\BOOKMARK [1][-]{section.7}{\376\377\000S\000A\000C\000-\000S\173\227\154\325}{}% 7
|
||||
\BOOKMARK [1][-]{section.8}{\376\377\000S\000A\000C\173\227\154\325}{}% 8
|
||||
@@ -11,6 +11,27 @@
|
||||
\usepackage{float} % 调用该包能够使用[H]
|
||||
% \pagestyle{plain} % 去除页眉,但是保留页脚编号,都去掉plain换empty
|
||||
|
||||
% 更改脚注为圆圈
|
||||
\usepackage{pifont}
|
||||
\makeatletter
|
||||
\newcommand*{\circnum}[1]{%
|
||||
\expandafter\@circnum\csname c@#1\endcsname
|
||||
}
|
||||
\newcommand*{\@circnum}[1]{%
|
||||
\ifnum#1<1 %
|
||||
\@ctrerr
|
||||
\else
|
||||
\ifnum#1>20 %
|
||||
\@ctrerr
|
||||
\else
|
||||
\ding{\the\numexpr 171+(#1)\relax}%
|
||||
\fi
|
||||
\fi
|
||||
}
|
||||
\makeatother
|
||||
|
||||
\renewcommand*{\thefootnote}{\circnum{footnote}}
|
||||
|
||||
\begin{document}
|
||||
\tableofcontents % 目录,注意要运行两下或者vscode保存两下才能显示
|
||||
% \singlespacing
|
||||
@@ -69,27 +90,10 @@
|
||||
\end{algorithm}
|
||||
\footnotetext[1]{Reinforcement Learning: An Introduction}
|
||||
\clearpage
|
||||
\section{Policy Gradient算法}
|
||||
\begin{algorithm}[H] % [H]固定位置
|
||||
\floatname{algorithm}{{REINFORCE算法:Monte-Carlo Policy Gradient}\footnotemark[1]}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\begin{algorithmic}[1] % [1]显示步数
|
||||
\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
|
||||
\FOR {回合数 = $1,M$}
|
||||
\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
|
||||
\FOR {时步 = $1,t$}
|
||||
\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
|
||||
\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
|
||||
\ENDFOR
|
||||
\ENDFOR
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
\footnotetext[1]{Reinforcement Learning: An Introduction}
|
||||
\clearpage
|
||||
|
||||
\section{DQN算法}
|
||||
\begin{algorithm}[H] % [H]固定位置
|
||||
\floatname{algorithm}{{DQN算法}{\hypersetup{linkcolor=white}\footnotemark}}
|
||||
\floatname{algorithm}{{DQN算法}\footnotemark[1]}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\renewcommand{\algorithmicrequire}{\textbf{输入:}}
|
||||
@@ -109,10 +113,10 @@
|
||||
\STATE 更新环境状态$s_{t+1} \leftarrow s_t$
|
||||
\STATE {\bfseries 更新策略:}
|
||||
\STATE 从$D$中采样一个batch的transition
|
||||
\STATE 计算实际的$Q$值,即$y_{j}${\hypersetup{linkcolor=white}\footnotemark}
|
||||
\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降{\hypersetup{linkcolor=white}\footnotemark}
|
||||
\STATE 计算实际的$Q$值,即$y_{j}$\footnotemark[2]
|
||||
\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降\footnotemark[3]
|
||||
\ENDFOR
|
||||
\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q${\hypersetup{linkcolor=white}\footnotemark}
|
||||
\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
|
||||
\ENDFOR
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
@@ -121,7 +125,46 @@
|
||||
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
|
||||
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步,但没有每$C$个回合稳定}
|
||||
\clearpage
|
||||
\section{Policy Gradient算法}
|
||||
\begin{algorithm}[H] % [H]固定位置
|
||||
\floatname{algorithm}{{REINFORCE算法:Monte-Carlo Policy Gradient}\footnotemark[1]}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\begin{algorithmic}[1] % [1]显示步数
|
||||
\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
|
||||
\FOR {回合数 = $1,M$}
|
||||
\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
|
||||
\FOR {时步 = $1,t$}
|
||||
\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
|
||||
\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
|
||||
\ENDFOR
|
||||
\ENDFOR
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
\footnotetext[1]{Reinforcement Learning: An Introduction}
|
||||
\clearpage
|
||||
\section{Advantage Actor Critic算法}
|
||||
\begin{algorithm}[H] % [H]固定位置
|
||||
\floatname{algorithm}{{Q Actor Critic算法}}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\begin{algorithmic}[1] % [1]显示步数
|
||||
\STATE 初始化Actor参数$\theta$和Critic参数$w$
|
||||
\FOR {回合数 = $1,M$}
|
||||
\STATE 根据策略$\pi_{\theta}(a|s)$采样一个(或几个)回合的transition
|
||||
\STATE {\bfseries 更新Critic参数\footnotemark[1]}
|
||||
\FOR {时步 = $t+1,1$}
|
||||
\STATE 计算Advantage,即$ \delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1})-Q_w(s_t,a_t)$
|
||||
\STATE $w \leftarrow w+\alpha_{w} \delta_{t} \nabla_{w} Q_w(s_t,a_t)$
|
||||
\STATE $a_t \leftarrow a_{t+1}$,$s_t \leftarrow s_{t+1}$
|
||||
\ENDFOR
|
||||
\STATE 更新Actor参数$\theta \leftarrow \theta+\alpha_{\theta} Q_{w}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)$
|
||||
\ENDFOR
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
\footnotetext[1]{这里结合TD error的特性按照从$t+1$到$1$计算Advantage更方便}
|
||||
|
||||
\clearpage
|
||||
\section{SoftQ算法}
|
||||
\begin{algorithm}[H]
|
||||
\floatname{algorithm}{{SoftQ算法}}
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
\contentsline {section}{\numberline {1}模版备用}{2}{section.1}%
|
||||
\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}%
|
||||
\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}%
|
||||
\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}%
|
||||
\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}%
|
||||
\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}%
|
||||
\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}%
|
||||
\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}%
|
||||
@@ -1,56 +1,60 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-05-03 22:16:08
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-07-20 23:54:40
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.distributions import Categorical
|
||||
import numpy as np
|
||||
|
||||
|
||||
class ActorCritic(nn.Module):
    """Shared A2C network with an actor head and a critic head.

    The critic maps a batched state to a scalar value estimate V(s);
    the actor maps the same state to a categorical action distribution.
    """

    def __init__(self, input_dim, output_dim, hidden_dim):
        super(ActorCritic, self).__init__()
        # Critic: state -> scalar value estimate.
        value_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.critic = nn.Sequential(*value_layers)
        # Actor: state -> action probabilities (softmax over the action
        # dimension of a batched input, hence dim=1).
        policy_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.Softmax(dim=1),
        ]
        self.actor = nn.Sequential(*policy_layers)

    def forward(self, x):
        """Return (action distribution, state value) for batch input x."""
        state_value = self.critic(x)
        action_probs = self.actor(x)
        return Categorical(action_probs), state_value
|
||||
class A2C:
    """Advantage Actor-Critic (A2C) agent with separate actor and critic nets.

    Constructor arguments:
        models:   dict with 'Actor' (state -> action probabilities) and
                  'Critic' (state -> scalar value) torch modules.
        memories: dict with 'ACMemory' exposing push()/sample()/clear().
        cfg:      dict with keys 'n_actions', 'gamma', 'device',
                  'actor_lr', 'critic_lr'.
    """
    # An earlier duplicate __init__(self, n_states, n_actions, cfg) was dead
    # code (silently shadowed by the definition below) and has been removed.
    def __init__(self, models, memories, cfg):
        self.n_actions = cfg['n_actions']
        self.gamma = cfg['gamma']  # discount factor
        self.device = torch.device(cfg['device'])
        self.memory = memories['ACMemory']
        self.actor = models['Actor'].to(self.device)
        self.critic = models['Critic'].to(self.device)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])

    def sample_action(self, state):
        """Sample an action during training.

        Returns:
            (action, value, dist): chosen action index, detached scalar
            value estimate, and the grad-carrying action-probability tensor.
        """
        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        dist = self.actor(state)  # keeps requires_grad=True for the policy loss
        value = self.critic(state)
        value = value.detach().numpy().squeeze(0)[0]
        action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0))  # p has shape (n_actions,)
        return action, value, dist

    def predict_action(self, state):
        """Predict an action at evaluation time.

        The original body was a byte-for-byte copy of sample_action; it now
        delegates so the two code paths cannot drift apart.
        """
        return self.sample_action(state)

    def update(self, next_state, entropy):
        """Run one A2C update from the transitions currently in memory.

        Args:
            next_state: state used to bootstrap the return of the last
                stored transition.
            entropy: accumulated exploration bonus added to the loss.
        """
        value_pool, log_prob_pool, reward_pool = self.memory.sample()
        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        # detach(): the bootstrapped returns are fixed regression targets;
        # without it, writing the grad-carrying tensor into the numpy
        # `returns` array below raises a RuntimeError.
        next_value = self.critic(next_state).detach()
        returns = np.zeros_like(reward_pool)
        for t in reversed(range(len(reward_pool))):
            next_value = reward_pool[t] + self.gamma * next_value  # G(s_t,a_t) = r_{t+1} + gamma * V(s_{t+1})
            returns[t] = next_value
        returns = torch.tensor(returns, device=self.device)
        value_pool = torch.tensor(value_pool, device=self.device)
        advantages = returns - value_pool
        log_prob_pool = torch.stack(log_prob_pool)
        actor_loss = (-log_prob_pool * advantages).mean()
        # NOTE(review): `advantages` is built from detached/replayed values,
        # so critic_loss carries no gradient into the critic here — the
        # critic appears not to be trained by this loss; confirm intent.
        critic_loss = 0.5 * advantages.pow(2).mean()
        tot_loss = actor_loss + critic_loss + 0.001 * entropy
        self.actor_optim.zero_grad()
        self.critic_optim.zero_grad()
        tot_loss.backward()
        self.actor_optim.step()
        self.critic_optim.step()
        self.memory.clear()

    def save_model(self, path):
        """Save actor and critic weights under `path` (created if missing)."""
        from pathlib import Path
        Path(path).mkdir(parents=True, exist_ok=True)
        torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
        torch.save(self.critic.state_dict(), f"{path}/critic_checkpoint.pt")

    def compute_returns(self, next_value, rewards, masks):
        """Discounted returns; masks[step] == 0 cuts the bootstrap at episode end."""
        R = next_value
        returns = []
        for step in reversed(range(len(rewards))):
            R = rewards[step] + self.gamma * R * masks[step]
            returns.insert(0, R)
        return returns

    def load_model(self, path):
        """Load actor and critic weights previously written by save_model."""
        self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
        self.critic.load_state_dict(torch.load(f"{path}/critic_checkpoint.pt"))
|
||||
55
projects/codes/A2C/a2c_2.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
class A2C_2:
    """A2C agent variant driven by a single shared actor-critic network.

    models['ActorCritic'] is expected to map a batched state tensor to a
    (value, action-probability) pair — NOTE(review): ActorCritic in a2c.py
    returns (dist, value) in the opposite order; confirm which network
    class is actually passed in here.
    """
    def __init__(self,models,memories,cfg):
        # cfg is a plain dict of hyperparameters.
        self.n_actions = cfg['n_actions']
        self.gamma = cfg['gamma']  # discount factor
        self.device = torch.device(cfg['device'])
        self.memory = memories['ACMemory']  # on-policy transition buffer
        self.ac_net = models['ActorCritic'].to(self.device)
        self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
    def sample_action(self,state):
        # Sample an action for training; returns (action, value, dist).
        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        value, dist = self.ac_net(state) # note that 'dist' must keep requires_grad=True for the policy loss
        value = value.detach().numpy().squeeze(0)[0]
        action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # p has shape (n_actions,)
        return action,value,dist
    def predict_action(self,state):
        ''' Predict an action at evaluation time.

        The whole body runs under no_grad(), so no detach() calls are
        needed (the alternative would be to copy sample_action verbatim).
        '''
        with torch.no_grad():
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            value, dist = self.ac_net(state)
            value = value.numpy().squeeze(0)[0] # value has shape (1,) before indexing
            action = np.random.choice(self.n_actions, p=dist.numpy().squeeze(0)) # p has shape (n_actions,)
            return action,value,dist
    def update(self,next_state,entropy):
        # One A2C update from the transitions currently in memory.
        # next_state bootstraps the return of the last stored transition;
        # entropy is an accumulated exploration bonus added to the loss.
        value_pool,log_prob_pool,reward_pool = self.memory.sample()
        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        next_value,_ = self.ac_net(next_state)
        # Bootstrapped discounted returns, computed backwards in time.
        returns = np.zeros_like(reward_pool)
        for t in reversed(range(len(reward_pool))):
            next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
            returns[t] = next_value
        returns = torch.tensor(returns, device=self.device)
        value_pool = torch.tensor(value_pool, device=self.device)
        advantages = returns - value_pool
        log_prob_pool = torch.stack(log_prob_pool)
        actor_loss = (-log_prob_pool * advantages).mean()
        critic_loss = 0.5 * advantages.pow(2).mean()
        # Joint loss for the shared network; 0.001 weights the entropy bonus.
        ac_loss = actor_loss + critic_loss + 0.001 * entropy
        self.ac_optimizer.zero_grad()
        ac_loss.backward()
        self.ac_optimizer.step()
        self.memory.clear()  # on-policy: discard used transitions
    def save_model(self, path):
        # Save the shared network's weights under `path` (created if missing).
        from pathlib import Path
        # create path
        Path(path).mkdir(parents=True, exist_ok=True)
        torch.save(self.ac_net.state_dict(), f"{path}/a2c_checkpoint.pt")

    def load_model(self, path):
        # Load weights previously written by save_model.
        self.ac_net.load_state_dict(torch.load(f"{path}/a2c_checkpoint.pt"))
|
||||
|
||||
|
||||
121
projects/codes/A2C/main.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import sys,os
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add path to system path
|
||||
|
||||
import datetime
|
||||
import argparse
|
||||
import gym
|
||||
import torch
|
||||
import numpy as np
|
||||
from common.utils import all_seed
|
||||
from common.launcher import Launcher
|
||||
from common.memories import PGReplay
|
||||
from common.models import ActorSoftmax,Critic
|
||||
from envs.register import register_env
|
||||
from a2c import A2C
|
||||
|
||||
class Main(Launcher):
|
||||
    def get_args(self):
        """Parse command-line hyperparameters and merge in derived output paths.

        Returns:
            dict: all argparse values plus 'result_path' and 'model_path',
            both derived from the env name and the current timestamp.
        """
        curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
        parser = argparse.ArgumentParser(description="hyperparameters")
        parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
        parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
        parser.add_argument('--train_eps',default=1600,type=int,help="episodes of training")
        parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
        parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
        parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
        parser.add_argument('--actor_lr',default=3e-4,type=float,help="learning rate of actor")
        parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
        parser.add_argument('--actor_hidden_dim',default=256,type=int,help="hidden of actor net")
        parser.add_argument('--critic_hidden_dim',default=256,type=int,help="hidden of critic net")
        parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
        parser.add_argument('--seed',default=10,type=int,help="seed")
        parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
        parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
        args = parser.parse_args()
        # curr_path is a module-level constant (directory of this script).
        default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
                        'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
        }
        args = {**vars(args),**default_args} # merge into a plain dict
        return args
|
||||
def env_agent_config(self,cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
models = {'Actor':ActorSoftmax(cfg['n_states'],cfg['n_actions'], hidden_dim = cfg['actor_hidden_dim']),'Critic':Critic(cfg['n_states'],1,hidden_dim=cfg['critic_hidden_dim'])}
|
||||
memories = {'ACMemory':PGReplay()}
|
||||
agent = A2C(models,memories,cfg)
|
||||
return env,agent
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
|
||||
for i_ep in range(cfg['train_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0 # step per episode
|
||||
ep_entropy = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action, value, dist = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
log_prob = torch.log(dist.squeeze(0)[action])
|
||||
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
|
||||
agent.memory.push((value,log_prob,reward)) # save transitions
|
||||
state = next_state # update state
|
||||
ep_reward += reward
|
||||
ep_entropy += entropy
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
agent.update(next_state,ep_entropy) # update agent
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
if (i_ep+1)%10==0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
|
||||
print("Finish training!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action,_,_ = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
|
||||
|
||||
|
||||
120
projects/codes/A2C/main2.py
Normal file
@@ -0,0 +1,120 @@
|
||||
import sys,os
|
||||
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add path to system path
|
||||
|
||||
import datetime
|
||||
import argparse
|
||||
import gym
|
||||
import torch
|
||||
import numpy as np
|
||||
from common.utils import all_seed
|
||||
from common.launcher import Launcher
|
||||
from common.memories import PGReplay
|
||||
from common.models import ActorCriticSoftmax
|
||||
from envs.register import register_env
|
||||
from a2c_2 import A2C_2
|
||||
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=2000,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
|
||||
parser.add_argument('--lr',default=3e-4,type=float,help="learning rate")
|
||||
parser.add_argument('--actor_hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--critic_hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(self,cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
models = {'ActorCritic':ActorCriticSoftmax(cfg['n_states'],cfg['n_actions'], actor_hidden_dim = cfg['actor_hidden_dim'],critic_hidden_dim=cfg['critic_hidden_dim'])}
|
||||
memories = {'ACMemory':PGReplay()}
|
||||
agent = A2C_2(models,memories,cfg)
|
||||
return env,agent
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
|
||||
for i_ep in range(cfg['train_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0 # step per episode
|
||||
ep_entropy = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action, value, dist = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
log_prob = torch.log(dist.squeeze(0)[action])
|
||||
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
|
||||
agent.memory.push((value,log_prob,reward)) # save transitions
|
||||
state = next_state # update state
|
||||
ep_reward += reward
|
||||
ep_entropy += entropy
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
agent.update(next_state,ep_entropy) # update agent
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
if (i_ep+1)%10==0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
|
||||
print("Finish training!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action,_,_ = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"algo_name": "A2C",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 2000,
|
||||
"test_eps": 20,
|
||||
"ep_max_steps": 100000,
|
||||
"gamma": 0.99,
|
||||
"lr": 0.0003,
|
||||
"actor_hidden_dim": 256,
|
||||
"critic_hidden_dim": 256,
|
||||
"device": "cpu",
|
||||
"seed": 10,
|
||||
"show_fig": false,
|
||||
"save_fig": true,
|
||||
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/results/",
|
||||
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/models/",
|
||||
"n_states": 4,
|
||||
"n_actions": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 44 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,200.0,200
|
||||
1,200.0,200
|
||||
2,93.0,93
|
||||
3,155.0,155
|
||||
4,116.0,116
|
||||
5,200.0,200
|
||||
6,190.0,190
|
||||
7,176.0,176
|
||||
8,200.0,200
|
||||
9,200.0,200
|
||||
10,200.0,200
|
||||
11,179.0,179
|
||||
12,200.0,200
|
||||
13,185.0,185
|
||||
14,191.0,191
|
||||
15,200.0,200
|
||||
16,200.0,200
|
||||
17,124.0,124
|
||||
18,200.0,200
|
||||
19,172.0,172
|
||||
|
|
After Width: | Height: | Size: 63 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "A2C", "env_name": "CartPole-v0", "train_eps": 1600, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "actor_lr": 0.0003, "critic_lr": 0.001, "actor_hidden_dim": 256, "critic_hidden_dim": 256, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/models/", "n_states": 4, "n_actions": 2}
|
||||
|
After Width: | Height: | Size: 41 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,177.0,177
|
||||
1,180.0,180
|
||||
2,200.0,200
|
||||
3,200.0,200
|
||||
4,167.0,167
|
||||
5,124.0,124
|
||||
6,128.0,128
|
||||
7,200.0,200
|
||||
8,200.0,200
|
||||
9,200.0,200
|
||||
10,186.0,186
|
||||
11,187.0,187
|
||||
12,200.0,200
|
||||
13,176.0,176
|
||||
14,200.0,200
|
||||
15,200.0,200
|
||||
16,200.0,200
|
||||
17,200.0,200
|
||||
18,185.0,185
|
||||
19,180.0,180
|
||||
|
|
After Width: | Height: | Size: 66 KiB |
56
projects/codes/A3C/a3c.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-05-03 22:16:08
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-07-20 23:54:40
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.distributions import Categorical
|
||||
|
||||
class ActorCritic(nn.Module):
|
||||
''' A2C网络模型,包含一个Actor和Critic
|
||||
'''
|
||||
def __init__(self, input_dim, output_dim, hidden_dim):
|
||||
super(ActorCritic, self).__init__()
|
||||
self.critic = nn.Sequential(
|
||||
nn.Linear(input_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, 1)
|
||||
)
|
||||
|
||||
self.actor = nn.Sequential(
|
||||
nn.Linear(input_dim, hidden_dim),
|
||||
nn.ReLU(),
|
||||
nn.Linear(hidden_dim, output_dim),
|
||||
nn.Softmax(dim=1),
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
value = self.critic(x)
|
||||
probs = self.actor(x)
|
||||
dist = Categorical(probs)
|
||||
return dist, value
|
||||
class A2C:
|
||||
''' A2C算法
|
||||
'''
|
||||
def __init__(self,n_states,n_actions,cfg) -> None:
|
||||
self.gamma = cfg.gamma
|
||||
self.device = torch.device(cfg.device)
|
||||
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
|
||||
self.optimizer = optim.Adam(self.model.parameters())
|
||||
|
||||
def compute_returns(self,next_value, rewards, masks):
|
||||
R = next_value
|
||||
returns = []
|
||||
for step in reversed(range(len(rewards))):
|
||||
R = rewards[step] + self.gamma * R * masks[step]
|
||||
returns.insert(0, R)
|
||||
return returns
|
||||
|
Before Width: | Height: | Size: 64 KiB After Width: | Height: | Size: 64 KiB |
@@ -10,7 +10,7 @@ import torch.optim as optim
|
||||
import datetime
|
||||
import argparse
|
||||
from common.multiprocessing_env import SubprocVecEnv
|
||||
from a2c import ActorCritic
|
||||
from a3c import ActorCritic
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import plot_rewards, save_args
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:50:49
|
||||
@LastEditor: John
|
||||
LastEditTime: 2022-08-23 23:59:54
|
||||
LastEditTime: 2022-08-29 23:30:08
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
@@ -78,7 +78,7 @@ class DQN:
|
||||
self.batch_size)
|
||||
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
|
||||
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
# print(state_batch.shape,action_batch.shape,reward_batch.shape,next_state_batch.shape,done_batch.shape)
|
||||
@@ -91,7 +91,7 @@ class DQN:
|
||||
# compute expected q value, for terminal state, done_batch[0]=1, and expected_q_value=rewardcorrespondingly
|
||||
expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch* (1-done_batch)
|
||||
# print(expected_q_value_batch.shape,expected_q_value_batch.requires_grad)
|
||||
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to
|
||||
loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # shape same to
|
||||
# backpropagation
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
|
||||
@@ -9,129 +9,122 @@ import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
import argparse
|
||||
from common.utils import save_results,all_seed
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.utils import all_seed
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from common.launcher import Launcher
|
||||
from envs.register import register_env
|
||||
from dqn import DQN
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
""" hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
# please manually change the following args in this script if you want
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models' )
|
||||
args = parser.parse_args()
|
||||
args = {**vars(args)} # type(dict)
|
||||
return args
|
||||
|
||||
def get_args():
|
||||
""" hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon, the higher value, the slower decay")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
# please manually change the following args in this script if you want
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models' )
|
||||
args = parser.parse_args()
|
||||
args = {**vars(args)} # type(dict)
|
||||
return args
|
||||
def env_agent_config(cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
||||
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
||||
agent = DQN(model,memory,cfg) # create agent
|
||||
return env, agent
|
||||
|
||||
def env_agent_config(cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
env = gym.make(cfg['env_name']) # create env
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
n_states = env.observation_space.shape[0] # state dimension
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
model = MLP(n_states,n_actions,hidden_dim=cfg["hidden_dim"])
|
||||
memory = ReplayBuffer(cfg["memory_capacity"]) # replay buffer
|
||||
agent = DQN(model,memory,cfg) # create agent
|
||||
return env, agent
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step += 1
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.memory.push(state, action, reward,
|
||||
next_state, done) # save transitions
|
||||
state = next_state # update next state for env
|
||||
agent.update() # update agent
|
||||
ep_reward += reward #
|
||||
if done:
|
||||
break
|
||||
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' 训练
|
||||
'''
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
ep_step += 1
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.memory.push(state, action, reward,
|
||||
next_state, done) # save transitions
|
||||
state = next_state # update next state for env
|
||||
agent.update() # update agent
|
||||
ep_reward += reward #
|
||||
if done:
|
||||
break
|
||||
if (i_ep + 1) % cfg["target_update"] == 0: # target net update, target_update means "C" in pseucodes
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
return res_dic
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
ep_step+=1
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
def test(cfg, env, agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
ep_step+=1
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
# training
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
save_args(cfg,path = cfg['result_path']) # save parameters
|
||||
agent.save_model(path = cfg['model_path']) # save models
|
||||
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
|
||||
# testing
|
||||
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
|
||||
agent.load_model(path = cfg['model_path']) # load model
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path = cfg['result_path'])
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
@@ -1 +1,21 @@
|
||||
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 10, "result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results", "model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models", "show_fig": false, "save_fig": true}
|
||||
{
|
||||
"algo_name": "DQN",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 200,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.95,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 500,
|
||||
"lr": 0.0001,
|
||||
"memory_capacity": 100000,
|
||||
"batch_size": 64,
|
||||
"target_update": 4,
|
||||
"hidden_dim": 256,
|
||||
"device": "cpu",
|
||||
"seed": 10,
|
||||
"result_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/results",
|
||||
"model_path": "C:\\Users\\jiangji\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v0/20220823-173936/models",
|
||||
"show_fig": false,
|
||||
"save_fig": true
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"algo_name": "DQN",
|
||||
"env_name": "CartPole-v1",
|
||||
"train_eps": 2000,
|
||||
"test_eps": 20,
|
||||
"ep_max_steps": 100000,
|
||||
"gamma": 0.99,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 6000,
|
||||
"lr": 1e-05,
|
||||
"memory_capacity": 200000,
|
||||
"batch_size": 64,
|
||||
"target_update": 4,
|
||||
"hidden_dim": 256,
|
||||
"device": "cuda",
|
||||
"seed": 10,
|
||||
"show_fig": false,
|
||||
"save_fig": true,
|
||||
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results",
|
||||
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models",
|
||||
"n_states": 4,
|
||||
"n_actions": 2
|
||||
}
|
||||
|
After Width: | Height: | Size: 50 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,371.0,371
|
||||
1,446.0,446
|
||||
2,300.0,300
|
||||
3,500.0,500
|
||||
4,313.0,313
|
||||
5,500.0,500
|
||||
6,341.0,341
|
||||
7,489.0,489
|
||||
8,304.0,304
|
||||
9,358.0,358
|
||||
10,278.0,278
|
||||
11,500.0,500
|
||||
12,500.0,500
|
||||
13,500.0,500
|
||||
14,500.0,500
|
||||
15,476.0,476
|
||||
16,308.0,308
|
||||
17,394.0,394
|
||||
18,500.0,500
|
||||
19,500.0,500
|
||||
|
|
After Width: | Height: | Size: 50 KiB |
@@ -5,7 +5,7 @@
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:50:49
|
||||
@LastEditor: John
|
||||
LastEditTime: 2022-07-21 00:08:26
|
||||
LastEditTime: 2022-08-29 23:34:20
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
@@ -20,148 +20,87 @@ import torch.nn.functional as F
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
self.capacity = capacity # 经验回放的容量
|
||||
self.buffer = [] # 缓冲区
|
||||
self.position = 0
|
||||
|
||||
def push(self, state, action, reward, next_state, done):
|
||||
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
|
||||
'''
|
||||
if len(self.buffer) < self.capacity:
|
||||
self.buffer.append(None)
|
||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
||||
self.position = (self.position + 1) % self.capacity
|
||||
|
||||
def sample(self, batch_size):
|
||||
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
|
||||
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
|
||||
return state, action, reward, next_state, done
|
||||
|
||||
def __len__(self):
|
||||
''' 返回当前存储的量
|
||||
'''
|
||||
return len(self.buffer)
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, n_states,n_actions,hidden_dim=128):
|
||||
""" 初始化q网络,为全连接网络
|
||||
n_states: 输入的特征数即环境的状态维度
|
||||
n_actions: 输出的动作维度
|
||||
"""
|
||||
super(MLP, self).__init__()
|
||||
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
|
||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
|
||||
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
|
||||
|
||||
def forward(self, x):
|
||||
# 各层对应的激活函数
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
return self.fc3(x)
|
||||
|
||||
class DoubleDQN:
|
||||
def __init__(self, n_states, n_actions, model, memory, cfg):
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = torch.device(cfg.device) # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma
|
||||
# e-greedy策略相关参数
|
||||
self.sample_count = 0
|
||||
self.epsilon_start = cfg.epsilon_start
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = model.to(self.device)
|
||||
self.target_net = model.to(self.device)
|
||||
def __init__(self,models, memories, cfg):
|
||||
self.n_actions = cfg['n_actions']
|
||||
self.device = torch.device(cfg['device'])
|
||||
self.gamma = cfg['gamma']
|
||||
## e-greedy parameters
|
||||
self.sample_count = 0 # sample count for epsilon decay
|
||||
self.epsilon_start = cfg['epsilon_start']
|
||||
self.epsilon_end = cfg['epsilon_end']
|
||||
self.epsilon_decay = cfg['epsilon_decay']
|
||||
self.batch_size = cfg['batch_size']
|
||||
self.policy_net = models['Qnet'].to(self.device)
|
||||
self.target_net = models['Qnet'].to(self.device)
|
||||
# target_net copy from policy_net
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
target_param.data.copy_(param.data)
|
||||
# self.target_net.eval() # 不启用 BatchNormalization 和 Dropout
|
||||
# 可查parameters()与state_dict()的区别,前者require_grad=True
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
|
||||
self.loss = 0
|
||||
self.memory = memory
|
||||
# self.target_net.eval() # donnot use BatchNormalization or Dropout
|
||||
# the difference between parameters() and state_dict() is that parameters() require_grad=True
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg['lr'])
|
||||
self.memory = memories['Memory']
|
||||
self.update_flag = False
|
||||
|
||||
def sample(self, state):
|
||||
'''选择动作
|
||||
def sample_action(self, state):
|
||||
''' sample action
|
||||
'''
|
||||
self.sample_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
|
||||
if random.random() > self.epsilon:
|
||||
with torch.no_grad():
|
||||
# 先转为张量便于丢给神经网络,state元素数据原本为float64
|
||||
# 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
|
||||
state = torch.tensor(
|
||||
[state], device=self.device, dtype=torch.float32)
|
||||
# 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
|
||||
q_value = self.policy_net(state)
|
||||
# tensor.max(1)返回每行的最大值以及对应的下标,
|
||||
# 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
|
||||
# 所以tensor.max(1)[1]返回最大值对应的下标,即action
|
||||
action = q_value.max(1)[1].item()
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def predict(self, state):
|
||||
'''选择动作
|
||||
def predict_action(self, state):
|
||||
''' predict action
|
||||
'''
|
||||
with torch.no_grad():
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
|
||||
q_value = self.policy_net(state)
|
||||
action = q_value.max(1)[1].item()
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 只有memory满了才会更新
|
||||
if len(self.memory) < self.batch_size: # when transitions in memory donot meet a batch, not update
|
||||
return
|
||||
# 从memory中随机采样transition
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
else:
|
||||
if not self.update_flag:
|
||||
print("Begin to update!")
|
||||
self.update_flag = True
|
||||
# sample a batch of transitions from replay buffer
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
|
||||
# convert to tensor
|
||||
state_batch = torch.tensor(
|
||||
state_batch, device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
|
||||
1) # 例如tensor([[1],...,[0]])
|
||||
reward_batch = torch.tensor(
|
||||
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
|
||||
next_state_batch = torch.tensor(
|
||||
next_state_batch, device=self.device, dtype=torch.float)
|
||||
|
||||
done_batch = torch.tensor(np.float32(
|
||||
done_batch), device=self.device) # 将bool转为float然后转为张量
|
||||
# 计算当前(s_t,a)对应的Q(s_t, a)
|
||||
q_values = self.policy_net(state_batch)
|
||||
next_q_values = self.policy_net(next_state_batch)
|
||||
# 代入当前选择的action,得到Q(s_t|a=a_t)
|
||||
q_value = q_values.gather(dim=1, index=action_batch)
|
||||
'''以下是Nature DQN的q_target计算方式
|
||||
# 计算所有next states的Q'(s_{t+1})的最大值,Q'为目标网络的q函数
|
||||
next_q_state_value = self.target_net(
|
||||
next_state_batch).max(1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,])
|
||||
# 计算 q_target
|
||||
# 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward
|
||||
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
|
||||
'''
|
||||
'''以下是Double DQN q_target计算方式,与NatureDQN稍有不同'''
|
||||
next_target_values = self.target_net(
|
||||
next_state_batch)
|
||||
# 选出Q(s_t‘, a)对应的action,代入到next_target_values获得target net对应的next_q_value,即Q’(s_t|a=argmax Q(s_t‘, a))
|
||||
next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
|
||||
q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
|
||||
self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1)) # 计算 均方误差loss
|
||||
# 优化模型
|
||||
self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step
|
||||
# loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
|
||||
self.loss.backward()
|
||||
for param in self.policy_net.parameters(): # clip防止梯度爆炸
|
||||
state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
|
||||
next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
|
||||
# compute current Q(s_t|a=a_t)
|
||||
q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
|
||||
next_q_value_batch = self.policy_net(next_state_batch)
|
||||
'''the following is the way of computing Double DQN expected_q_value,a bit different from Nature DQN'''
|
||||
next_target_value_batch = self.target_net(next_state_batch)
|
||||
# choose action a from Q(s_t‘, a), next_target_values obtain next_q_value,which is Q’(s_t|a=argmax Q(s_t‘, a))
|
||||
next_target_q_value_batch = next_target_value_batch.gather(1, torch.max(next_q_value_batch, 1)[1].unsqueeze(1)) # shape(batchsize,1)
|
||||
expected_q_value_batch = reward_batch + self.gamma * next_target_q_value_batch * (1-done_batch)
|
||||
loss = nn.MSELoss()(q_value_batch , expected_q_value_batch)
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
# clip to avoid gradient explosion
|
||||
for param in self.policy_net.parameters():
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step() # 更新模型
|
||||
self.optimizer.step()
|
||||
|
||||
def save(self,path):
|
||||
def save_model(self,path):
|
||||
from pathlib import Path
|
||||
# create path
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
|
||||
|
||||
def load(self,path):
|
||||
def load_model(self,path):
|
||||
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
|
||||
129
projects/codes/DoubleDQN/main.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-11-07 18:10:37
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-08-29 23:33:31
|
||||
Discription:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import datetime
|
||||
import argparse
|
||||
|
||||
from common.utils import all_seed
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBufferQue
|
||||
from DoubleDQN.double_dqn import DoubleDQN
|
||||
from common.launcher import Launcher
|
||||
from envs.register import register_env
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
''' hyperparameters
|
||||
'''
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=1,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(self,cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
models = {'Qnet':MLP(n_states,n_actions,hidden_dim=cfg['hidden_dim'])}
|
||||
memories = {'Memory':ReplayBufferQue(cfg['memory_capacity'])}
|
||||
agent = DoubleDQN(models,memories,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg["train_eps"]):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.sample_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
agent.memory.push((state, action, reward, next_state, done))
|
||||
state = next_state
|
||||
agent.update()
|
||||
if done:
|
||||
break
|
||||
if i_ep % cfg['target_update'] == 0:
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep+1)%10 == 0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
env.close()
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
return res_dic
|
||||
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.predict_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main = Main()
|
||||
main.run()
|
||||
@@ -1 +0,0 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}
|
||||
|
Before Width: | Height: | Size: 34 KiB |
|
Before Width: | Height: | Size: 43 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233435/models/", "n_states": 4, "n_actions": 2}
|
||||
|
After Width: | Height: | Size: 53 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,145.0,0
|
||||
1,166.0,0
|
||||
2,171.0,0
|
||||
3,200.0,0
|
||||
4,139.0,0
|
||||
5,200.0,0
|
||||
6,200.0,0
|
||||
7,141.0,0
|
||||
8,200.0,0
|
||||
9,187.0,0
|
||||
10,166.0,0
|
||||
11,172.0,0
|
||||
12,121.0,0
|
||||
13,200.0,0
|
||||
14,200.0,0
|
||||
15,149.0,0
|
||||
16,128.0,0
|
||||
17,200.0,0
|
||||
18,178.0,0
|
||||
19,185.0,0
|
||||
|
|
After Width: | Height: | Size: 65 KiB |
@@ -0,0 +1,201 @@
|
||||
episodes,rewards,steps
|
||||
0,19.0,0
|
||||
1,16.0,0
|
||||
2,17.0,0
|
||||
3,11.0,0
|
||||
4,10.0,0
|
||||
5,27.0,0
|
||||
6,16.0,0
|
||||
7,9.0,0
|
||||
8,20.0,0
|
||||
9,21.0,0
|
||||
10,15.0,0
|
||||
11,10.0,0
|
||||
12,14.0,0
|
||||
13,37.0,0
|
||||
14,12.0,0
|
||||
15,10.0,0
|
||||
16,27.0,0
|
||||
17,33.0,0
|
||||
18,19.0,0
|
||||
19,13.0,0
|
||||
20,26.0,0
|
||||
21,15.0,0
|
||||
22,29.0,0
|
||||
23,11.0,0
|
||||
24,20.0,0
|
||||
25,23.0,0
|
||||
26,23.0,0
|
||||
27,26.0,0
|
||||
28,17.0,0
|
||||
29,33.0,0
|
||||
30,16.0,0
|
||||
31,48.0,0
|
||||
32,48.0,0
|
||||
33,69.0,0
|
||||
34,58.0,0
|
||||
35,24.0,0
|
||||
36,18.0,0
|
||||
37,28.0,0
|
||||
38,12.0,0
|
||||
39,12.0,0
|
||||
40,18.0,0
|
||||
41,12.0,0
|
||||
42,13.0,0
|
||||
43,21.0,0
|
||||
44,30.0,0
|
||||
45,32.0,0
|
||||
46,22.0,0
|
||||
47,18.0,0
|
||||
48,12.0,0
|
||||
49,12.0,0
|
||||
50,20.0,0
|
||||
51,32.0,0
|
||||
52,15.0,0
|
||||
53,100.0,0
|
||||
54,26.0,0
|
||||
55,25.0,0
|
||||
56,18.0,0
|
||||
57,15.0,0
|
||||
58,35.0,0
|
||||
59,12.0,0
|
||||
60,65.0,0
|
||||
61,27.0,0
|
||||
62,29.0,0
|
||||
63,22.0,0
|
||||
64,83.0,0
|
||||
65,24.0,0
|
||||
66,28.0,0
|
||||
67,15.0,0
|
||||
68,43.0,0
|
||||
69,13.0,0
|
||||
70,22.0,0
|
||||
71,46.0,0
|
||||
72,14.0,0
|
||||
73,32.0,0
|
||||
74,44.0,0
|
||||
75,53.0,0
|
||||
76,31.0,0
|
||||
77,51.0,0
|
||||
78,61.0,0
|
||||
79,30.0,0
|
||||
80,36.0,0
|
||||
81,30.0,0
|
||||
82,48.0,0
|
||||
83,26.0,0
|
||||
84,27.0,0
|
||||
85,43.0,0
|
||||
86,20.0,0
|
||||
87,87.0,0
|
||||
88,71.0,0
|
||||
89,43.0,0
|
||||
90,57.0,0
|
||||
91,40.0,0
|
||||
92,37.0,0
|
||||
93,43.0,0
|
||||
94,31.0,0
|
||||
95,45.0,0
|
||||
96,47.0,0
|
||||
97,52.0,0
|
||||
98,48.0,0
|
||||
99,98.0,0
|
||||
100,49.0,0
|
||||
101,98.0,0
|
||||
102,68.0,0
|
||||
103,70.0,0
|
||||
104,74.0,0
|
||||
105,73.0,0
|
||||
106,127.0,0
|
||||
107,92.0,0
|
||||
108,70.0,0
|
||||
109,97.0,0
|
||||
110,66.0,0
|
||||
111,112.0,0
|
||||
112,138.0,0
|
||||
113,81.0,0
|
||||
114,74.0,0
|
||||
115,153.0,0
|
||||
116,113.0,0
|
||||
117,88.0,0
|
||||
118,138.0,0
|
||||
119,200.0,0
|
||||
120,84.0,0
|
||||
121,123.0,0
|
||||
122,158.0,0
|
||||
123,171.0,0
|
||||
124,137.0,0
|
||||
125,143.0,0
|
||||
126,170.0,0
|
||||
127,127.0,0
|
||||
128,118.0,0
|
||||
129,200.0,0
|
||||
130,189.0,0
|
||||
131,149.0,0
|
||||
132,137.0,0
|
||||
133,115.0,0
|
||||
134,153.0,0
|
||||
135,136.0,0
|
||||
136,140.0,0
|
||||
137,169.0,0
|
||||
138,187.0,0
|
||||
139,200.0,0
|
||||
140,196.0,0
|
||||
141,200.0,0
|
||||
142,200.0,0
|
||||
143,137.0,0
|
||||
144,200.0,0
|
||||
145,185.0,0
|
||||
146,200.0,0
|
||||
147,164.0,0
|
||||
148,200.0,0
|
||||
149,143.0,0
|
||||
150,143.0,0
|
||||
151,112.0,0
|
||||
152,192.0,0
|
||||
153,200.0,0
|
||||
154,144.0,0
|
||||
155,188.0,0
|
||||
156,200.0,0
|
||||
157,133.0,0
|
||||
158,200.0,0
|
||||
159,143.0,0
|
||||
160,158.0,0
|
||||
161,161.0,0
|
||||
162,169.0,0
|
||||
163,176.0,0
|
||||
164,200.0,0
|
||||
165,149.0,0
|
||||
166,156.0,0
|
||||
167,200.0,0
|
||||
168,200.0,0
|
||||
169,200.0,0
|
||||
170,134.0,0
|
||||
171,171.0,0
|
||||
172,200.0,0
|
||||
173,200.0,0
|
||||
174,200.0,0
|
||||
175,194.0,0
|
||||
176,200.0,0
|
||||
177,138.0,0
|
||||
178,159.0,0
|
||||
179,187.0,0
|
||||
180,200.0,0
|
||||
181,192.0,0
|
||||
182,200.0,0
|
||||
183,200.0,0
|
||||
184,200.0,0
|
||||
185,173.0,0
|
||||
186,200.0,0
|
||||
187,178.0,0
|
||||
188,176.0,0
|
||||
189,196.0,0
|
||||
190,200.0,0
|
||||
191,195.0,0
|
||||
192,158.0,0
|
||||
193,156.0,0
|
||||
194,200.0,0
|
||||
195,200.0,0
|
||||
196,200.0,0
|
||||
197,200.0,0
|
||||
198,193.0,0
|
||||
199,200.0,0
|
||||
|
@@ -0,0 +1 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 1, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220829-233635/models/", "n_states": 4, "n_actions": 2}
|
||||
|
After Width: | Height: | Size: 40 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,200.0,0
|
||||
1,200.0,0
|
||||
2,200.0,0
|
||||
3,200.0,0
|
||||
4,191.0,0
|
||||
5,200.0,0
|
||||
6,200.0,0
|
||||
7,179.0,0
|
||||
8,200.0,0
|
||||
9,200.0,0
|
||||
10,200.0,0
|
||||
11,190.0,0
|
||||
12,147.0,0
|
||||
13,197.0,0
|
||||
14,200.0,0
|
||||
15,200.0,0
|
||||
16,167.0,0
|
||||
17,200.0,0
|
||||
18,200.0,0
|
||||
19,200.0,0
|
||||
|
|
After Width: | Height: | Size: 65 KiB |
@@ -0,0 +1,201 @@
|
||||
episodes,rewards,steps
|
||||
0,19.0,0
|
||||
1,16.0,0
|
||||
2,17.0,0
|
||||
3,11.0,0
|
||||
4,10.0,0
|
||||
5,27.0,0
|
||||
6,55.0,0
|
||||
7,17.0,0
|
||||
8,23.0,0
|
||||
9,9.0,0
|
||||
10,17.0,0
|
||||
11,14.0,0
|
||||
12,17.0,0
|
||||
13,12.0,0
|
||||
14,14.0,0
|
||||
15,16.0,0
|
||||
16,27.0,0
|
||||
17,36.0,0
|
||||
18,17.0,0
|
||||
19,17.0,0
|
||||
20,21.0,0
|
||||
21,23.0,0
|
||||
22,13.0,0
|
||||
23,12.0,0
|
||||
24,17.0,0
|
||||
25,26.0,0
|
||||
26,25.0,0
|
||||
27,17.0,0
|
||||
28,10.0,0
|
||||
29,16.0,0
|
||||
30,14.0,0
|
||||
31,19.0,0
|
||||
32,23.0,0
|
||||
33,37.0,0
|
||||
34,29.0,0
|
||||
35,22.0,0
|
||||
36,29.0,0
|
||||
37,15.0,0
|
||||
38,16.0,0
|
||||
39,18.0,0
|
||||
40,23.0,0
|
||||
41,16.0,0
|
||||
42,26.0,0
|
||||
43,13.0,0
|
||||
44,24.0,0
|
||||
45,39.0,0
|
||||
46,23.0,0
|
||||
47,32.0,0
|
||||
48,123.0,0
|
||||
49,18.0,0
|
||||
50,39.0,0
|
||||
51,17.0,0
|
||||
52,28.0,0
|
||||
53,34.0,0
|
||||
54,26.0,0
|
||||
55,61.0,0
|
||||
56,28.0,0
|
||||
57,16.0,0
|
||||
58,45.0,0
|
||||
59,41.0,0
|
||||
60,49.0,0
|
||||
61,18.0,0
|
||||
62,40.0,0
|
||||
63,24.0,0
|
||||
64,37.0,0
|
||||
65,26.0,0
|
||||
66,51.0,0
|
||||
67,17.0,0
|
||||
68,152.0,0
|
||||
69,17.0,0
|
||||
70,29.0,0
|
||||
71,37.0,0
|
||||
72,15.0,0
|
||||
73,55.0,0
|
||||
74,152.0,0
|
||||
75,23.0,0
|
||||
76,45.0,0
|
||||
77,30.0,0
|
||||
78,39.0,0
|
||||
79,20.0,0
|
||||
80,53.0,0
|
||||
81,49.0,0
|
||||
82,71.0,0
|
||||
83,115.0,0
|
||||
84,41.0,0
|
||||
85,52.0,0
|
||||
86,52.0,0
|
||||
87,36.0,0
|
||||
88,84.0,0
|
||||
89,122.0,0
|
||||
90,49.0,0
|
||||
91,200.0,0
|
||||
92,67.0,0
|
||||
93,87.0,0
|
||||
94,183.0,0
|
||||
95,132.0,0
|
||||
96,76.0,0
|
||||
97,200.0,0
|
||||
98,200.0,0
|
||||
99,200.0,0
|
||||
100,200.0,0
|
||||
101,200.0,0
|
||||
102,106.0,0
|
||||
103,192.0,0
|
||||
104,111.0,0
|
||||
105,95.0,0
|
||||
106,200.0,0
|
||||
107,200.0,0
|
||||
108,148.0,0
|
||||
109,200.0,0
|
||||
110,97.0,0
|
||||
111,200.0,0
|
||||
112,200.0,0
|
||||
113,105.0,0
|
||||
114,135.0,0
|
||||
115,200.0,0
|
||||
116,144.0,0
|
||||
117,156.0,0
|
||||
118,200.0,0
|
||||
119,200.0,0
|
||||
120,166.0,0
|
||||
121,200.0,0
|
||||
122,200.0,0
|
||||
123,200.0,0
|
||||
124,200.0,0
|
||||
125,200.0,0
|
||||
126,200.0,0
|
||||
127,158.0,0
|
||||
128,139.0,0
|
||||
129,200.0,0
|
||||
130,200.0,0
|
||||
131,200.0,0
|
||||
132,200.0,0
|
||||
133,122.0,0
|
||||
134,200.0,0
|
||||
135,188.0,0
|
||||
136,200.0,0
|
||||
137,183.0,0
|
||||
138,200.0,0
|
||||
139,200.0,0
|
||||
140,200.0,0
|
||||
141,200.0,0
|
||||
142,200.0,0
|
||||
143,158.0,0
|
||||
144,200.0,0
|
||||
145,200.0,0
|
||||
146,200.0,0
|
||||
147,191.0,0
|
||||
148,200.0,0
|
||||
149,194.0,0
|
||||
150,178.0,0
|
||||
151,200.0,0
|
||||
152,200.0,0
|
||||
153,200.0,0
|
||||
154,162.0,0
|
||||
155,200.0,0
|
||||
156,200.0,0
|
||||
157,128.0,0
|
||||
158,200.0,0
|
||||
159,184.0,0
|
||||
160,194.0,0
|
||||
161,200.0,0
|
||||
162,200.0,0
|
||||
163,200.0,0
|
||||
164,200.0,0
|
||||
165,160.0,0
|
||||
166,163.0,0
|
||||
167,200.0,0
|
||||
168,200.0,0
|
||||
169,200.0,0
|
||||
170,141.0,0
|
||||
171,200.0,0
|
||||
172,200.0,0
|
||||
173,200.0,0
|
||||
174,200.0,0
|
||||
175,200.0,0
|
||||
176,200.0,0
|
||||
177,157.0,0
|
||||
178,164.0,0
|
||||
179,200.0,0
|
||||
180,200.0,0
|
||||
181,200.0,0
|
||||
182,200.0,0
|
||||
183,200.0,0
|
||||
184,200.0,0
|
||||
185,193.0,0
|
||||
186,182.0,0
|
||||
187,200.0,0
|
||||
188,200.0,0
|
||||
189,200.0,0
|
||||
190,200.0,0
|
||||
191,200.0,0
|
||||
192,174.0,0
|
||||
193,178.0,0
|
||||
194,200.0,0
|
||||
195,200.0,0
|
||||
196,200.0,0
|
||||
197,200.0,0
|
||||
198,200.0,0
|
||||
199,200.0,0
|
||||
|
@@ -1,125 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: JiangJi
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-11-07 18:10:37
|
||||
LastEditor: JiangJi
|
||||
LastEditTime: 2022-07-21 21:52:31
|
||||
Discription:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import argparse
|
||||
|
||||
from common.utils import save_results,make_dir
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from DoubleDQN.double_dqn import DoubleDQN
|
||||
|
||||
def get_args():
|
||||
""" 超参数
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results/' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models/' ) # 保存模型的路径
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name)
|
||||
env.seed(seed)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n
|
||||
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
|
||||
memory = ReplayBuffer(cfg.memory_capacity)
|
||||
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print("开始训练!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.sample(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
agent.memory.push(state, action, reward, next_state, done)
|
||||
state = next_state
|
||||
agent.update()
|
||||
if done:
|
||||
break
|
||||
if i_ep % cfg.target_update == 0:
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
if (i_ep+1)%10 == 0:
|
||||
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f},Epislon:{agent.epsilon:.3f}')
|
||||
rewards.append(ep_reward)
|
||||
print("完成训练!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print("开始测试!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
action = agent.predict(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
|
||||
print("完成测试!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg,seed=1)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg) # 保存参数
|
||||
agent.save(path=cfg.model_path) # 保存模型
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg,seed=1)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果
|
||||
131
projects/codes/PolicyGradient/main.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-11-22 23:21:53
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-27 00:04:08
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import argparse
|
||||
from itertools import count
|
||||
import torch.nn.functional as F
|
||||
from pg import PolicyGradient
|
||||
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
|
||||
from common.models import MLP
|
||||
from common.memories import PGReplay
|
||||
from common.launcher import Launcher
|
||||
from envs.register import register_env
|
||||
|
||||
|
||||
class PGNet(MLP):
|
||||
''' instead of outputing action, PG Net outputs propabilities of actions, we can use class inheritance from MLP here
|
||||
'''
|
||||
def forward(self, x):
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
x = torch.sigmoid(self.fc3(x))
|
||||
return x
|
||||
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
""" Hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
|
||||
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
|
||||
parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
|
||||
parser.add_argument('--update_fre',default=8,type=int)
|
||||
parser.add_argument('--hidden_dim',default=36,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=1,type=int,help="seed")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(self,cfg):
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg['seed'])
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"state dim: {n_states}, action dim: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
model = PGNet(n_states,1,hidden_dim=cfg['hidden_dim'])
|
||||
memory = PGReplay()
|
||||
agent = PolicyGradient(model,memory,cfg)
|
||||
return env,agent
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = []
|
||||
for i_ep in range(cfg['train_eps']):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
if done:
|
||||
reward = 0
|
||||
agent.memory.push((state,float(action),reward))
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
if (i_ep+1) % 10 == 0:
|
||||
print(f"Episode:{i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
|
||||
if (i_ep+1) % cfg['update_fre'] == 0:
|
||||
agent.update()
|
||||
rewards.append(ep_reward)
|
||||
print('Finish training!')
|
||||
env.close() # close environment
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
return res_dic
|
||||
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = []
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
for _ in range(cfg['ep_max_steps']):
|
||||
action = agent.predict_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
if done:
|
||||
reward = 0
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']},Reward: {ep_reward:.2f}")
|
||||
rewards.append(ep_reward)
|
||||
print("Finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"algo_name": "PolicyGradient",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 200,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.99,
|
||||
"lr": 0.005,
|
||||
"update_fre": 8,
|
||||
"hidden_dim": 36,
|
||||
"device": "cpu",
|
||||
"seed": 1,
|
||||
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/results/",
|
||||
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/models/",
|
||||
"save_fig": true,
|
||||
"show_fig": false
|
||||
}
|
||||
|
Before Width: | Height: | Size: 35 KiB |
|
Before Width: | Height: | Size: 66 KiB |
@@ -1,201 +0,0 @@
|
||||
episodes,rewards
|
||||
0,26.0
|
||||
1,53.0
|
||||
2,10.0
|
||||
3,37.0
|
||||
4,22.0
|
||||
5,21.0
|
||||
6,12.0
|
||||
7,34.0
|
||||
8,38.0
|
||||
9,40.0
|
||||
10,23.0
|
||||
11,14.0
|
||||
12,16.0
|
||||
13,25.0
|
||||
14,15.0
|
||||
15,23.0
|
||||
16,11.0
|
||||
17,28.0
|
||||
18,21.0
|
||||
19,62.0
|
||||
20,33.0
|
||||
21,27.0
|
||||
22,15.0
|
||||
23,17.0
|
||||
24,26.0
|
||||
25,35.0
|
||||
26,26.0
|
||||
27,14.0
|
||||
28,42.0
|
||||
29,45.0
|
||||
30,34.0
|
||||
31,39.0
|
||||
32,31.0
|
||||
33,17.0
|
||||
34,42.0
|
||||
35,41.0
|
||||
36,31.0
|
||||
37,39.0
|
||||
38,28.0
|
||||
39,12.0
|
||||
40,36.0
|
||||
41,33.0
|
||||
42,47.0
|
||||
43,40.0
|
||||
44,63.0
|
||||
45,36.0
|
||||
46,64.0
|
||||
47,79.0
|
||||
48,49.0
|
||||
49,40.0
|
||||
50,65.0
|
||||
51,47.0
|
||||
52,51.0
|
||||
53,30.0
|
||||
54,26.0
|
||||
55,41.0
|
||||
56,86.0
|
||||
57,61.0
|
||||
58,38.0
|
||||
59,200.0
|
||||
60,49.0
|
||||
61,70.0
|
||||
62,61.0
|
||||
63,101.0
|
||||
64,200.0
|
||||
65,152.0
|
||||
66,108.0
|
||||
67,46.0
|
||||
68,72.0
|
||||
69,87.0
|
||||
70,27.0
|
||||
71,126.0
|
||||
72,46.0
|
||||
73,25.0
|
||||
74,14.0
|
||||
75,42.0
|
||||
76,38.0
|
||||
77,55.0
|
||||
78,42.0
|
||||
79,51.0
|
||||
80,67.0
|
||||
81,83.0
|
||||
82,178.0
|
||||
83,115.0
|
||||
84,140.0
|
||||
85,97.0
|
||||
86,85.0
|
||||
87,61.0
|
||||
88,153.0
|
||||
89,200.0
|
||||
90,200.0
|
||||
91,200.0
|
||||
92,200.0
|
||||
93,64.0
|
||||
94,200.0
|
||||
95,200.0
|
||||
96,157.0
|
||||
97,128.0
|
||||
98,160.0
|
||||
99,35.0
|
||||
100,140.0
|
||||
101,113.0
|
||||
102,200.0
|
||||
103,154.0
|
||||
104,200.0
|
||||
105,200.0
|
||||
106,200.0
|
||||
107,198.0
|
||||
108,137.0
|
||||
109,200.0
|
||||
110,200.0
|
||||
111,102.0
|
||||
112,200.0
|
||||
113,200.0
|
||||
114,200.0
|
||||
115,200.0
|
||||
116,148.0
|
||||
117,200.0
|
||||
118,200.0
|
||||
119,200.0
|
||||
120,200.0
|
||||
121,200.0
|
||||
122,194.0
|
||||
123,200.0
|
||||
124,200.0
|
||||
125,200.0
|
||||
126,183.0
|
||||
127,200.0
|
||||
128,200.0
|
||||
129,200.0
|
||||
130,200.0
|
||||
131,200.0
|
||||
132,200.0
|
||||
133,200.0
|
||||
134,200.0
|
||||
135,200.0
|
||||
136,93.0
|
||||
137,96.0
|
||||
138,84.0
|
||||
139,103.0
|
||||
140,79.0
|
||||
141,104.0
|
||||
142,82.0
|
||||
143,105.0
|
||||
144,200.0
|
||||
145,200.0
|
||||
146,171.0
|
||||
147,200.0
|
||||
148,200.0
|
||||
149,200.0
|
||||
150,200.0
|
||||
151,197.0
|
||||
152,133.0
|
||||
153,142.0
|
||||
154,147.0
|
||||
155,156.0
|
||||
156,131.0
|
||||
157,181.0
|
||||
158,163.0
|
||||
159,146.0
|
||||
160,200.0
|
||||
161,176.0
|
||||
162,200.0
|
||||
163,173.0
|
||||
164,177.0
|
||||
165,200.0
|
||||
166,200.0
|
||||
167,200.0
|
||||
168,200.0
|
||||
169,200.0
|
||||
170,200.0
|
||||
171,200.0
|
||||
172,200.0
|
||||
173,200.0
|
||||
174,200.0
|
||||
175,200.0
|
||||
176,200.0
|
||||
177,200.0
|
||||
178,200.0
|
||||
179,200.0
|
||||
180,200.0
|
||||
181,200.0
|
||||
182,200.0
|
||||
183,200.0
|
||||
184,200.0
|
||||
185,200.0
|
||||
186,200.0
|
||||
187,200.0
|
||||
188,200.0
|
||||
189,200.0
|
||||
190,200.0
|
||||
191,200.0
|
||||
192,200.0
|
||||
193,200.0
|
||||
194,200.0
|
||||
195,200.0
|
||||
196,190.0
|
||||
197,200.0
|
||||
198,189.0
|
||||
199,200.0
|
||||
|
@@ -0,0 +1 @@
|
||||
{"algo_name": "PolicyGradient", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "lr": 0.01, "update_fre": 8, "hidden_dim": 36, "device": "cpu", "seed": 1, "save_fig": true, "show_fig": false, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/models/", "n_states": 4, "n_actions": 2}
|
||||
|
After Width: | Height: | Size: 28 KiB |
@@ -1,7 +1,7 @@
|
||||
episodes,rewards
|
||||
0,200.0
|
||||
1,200.0
|
||||
2,165.0
|
||||
2,200.0
|
||||
3,200.0
|
||||
4,200.0
|
||||
5,200.0
|
||||
@@ -10,12 +10,12 @@ episodes,rewards
|
||||
8,200.0
|
||||
9,200.0
|
||||
10,200.0
|
||||
11,168.0
|
||||
11,200.0
|
||||
12,200.0
|
||||
13,200.0
|
||||
14,200.0
|
||||
15,115.0
|
||||
16,198.0
|
||||
15,200.0
|
||||
16,200.0
|
||||
17,200.0
|
||||
18,200.0
|
||||
19,200.0
|
||||
|
|
After Width: | Height: | Size: 60 KiB |
@@ -0,0 +1,201 @@
|
||||
episodes,rewards
|
||||
0,26.0
|
||||
1,53.0
|
||||
2,10.0
|
||||
3,37.0
|
||||
4,22.0
|
||||
5,21.0
|
||||
6,12.0
|
||||
7,34.0
|
||||
8,93.0
|
||||
9,36.0
|
||||
10,29.0
|
||||
11,18.0
|
||||
12,14.0
|
||||
13,62.0
|
||||
14,20.0
|
||||
15,40.0
|
||||
16,10.0
|
||||
17,10.0
|
||||
18,10.0
|
||||
19,11.0
|
||||
20,10.0
|
||||
21,14.0
|
||||
22,12.0
|
||||
23,8.0
|
||||
24,19.0
|
||||
25,33.0
|
||||
26,22.0
|
||||
27,32.0
|
||||
28,16.0
|
||||
29,24.0
|
||||
30,24.0
|
||||
31,24.0
|
||||
32,75.0
|
||||
33,33.0
|
||||
34,33.0
|
||||
35,72.0
|
||||
36,110.0
|
||||
37,48.0
|
||||
38,60.0
|
||||
39,43.0
|
||||
40,61.0
|
||||
41,34.0
|
||||
42,50.0
|
||||
43,61.0
|
||||
44,53.0
|
||||
45,58.0
|
||||
46,36.0
|
||||
47,44.0
|
||||
48,42.0
|
||||
49,64.0
|
||||
50,67.0
|
||||
51,52.0
|
||||
52,39.0
|
||||
53,42.0
|
||||
54,40.0
|
||||
55,33.0
|
||||
56,200.0
|
||||
57,199.0
|
||||
58,149.0
|
||||
59,185.0
|
||||
60,134.0
|
||||
61,174.0
|
||||
62,162.0
|
||||
63,200.0
|
||||
64,93.0
|
||||
65,72.0
|
||||
66,69.0
|
||||
67,51.0
|
||||
68,62.0
|
||||
69,98.0
|
||||
70,73.0
|
||||
71,73.0
|
||||
72,200.0
|
||||
73,200.0
|
||||
74,200.0
|
||||
75,200.0
|
||||
76,200.0
|
||||
77,200.0
|
||||
78,200.0
|
||||
79,133.0
|
||||
80,200.0
|
||||
81,200.0
|
||||
82,200.0
|
||||
83,200.0
|
||||
84,200.0
|
||||
85,200.0
|
||||
86,200.0
|
||||
87,200.0
|
||||
88,114.0
|
||||
89,151.0
|
||||
90,129.0
|
||||
91,156.0
|
||||
92,112.0
|
||||
93,172.0
|
||||
94,171.0
|
||||
95,141.0
|
||||
96,200.0
|
||||
97,200.0
|
||||
98,200.0
|
||||
99,200.0
|
||||
100,200.0
|
||||
101,200.0
|
||||
102,200.0
|
||||
103,200.0
|
||||
104,188.0
|
||||
105,199.0
|
||||
106,138.0
|
||||
107,200.0
|
||||
108,200.0
|
||||
109,181.0
|
||||
110,145.0
|
||||
111,200.0
|
||||
112,135.0
|
||||
113,119.0
|
||||
114,112.0
|
||||
115,122.0
|
||||
116,118.0
|
||||
117,119.0
|
||||
118,131.0
|
||||
119,119.0
|
||||
120,109.0
|
||||
121,96.0
|
||||
122,105.0
|
||||
123,29.0
|
||||
124,110.0
|
||||
125,113.0
|
||||
126,18.0
|
||||
127,90.0
|
||||
128,145.0
|
||||
129,152.0
|
||||
130,151.0
|
||||
131,109.0
|
||||
132,141.0
|
||||
133,109.0
|
||||
134,136.0
|
||||
135,143.0
|
||||
136,200.0
|
||||
137,200.0
|
||||
138,200.0
|
||||
139,200.0
|
||||
140,200.0
|
||||
141,200.0
|
||||
142,200.0
|
||||
143,200.0
|
||||
144,192.0
|
||||
145,173.0
|
||||
146,180.0
|
||||
147,182.0
|
||||
148,186.0
|
||||
149,175.0
|
||||
150,176.0
|
||||
151,191.0
|
||||
152,200.0
|
||||
153,200.0
|
||||
154,200.0
|
||||
155,200.0
|
||||
156,200.0
|
||||
157,200.0
|
||||
158,200.0
|
||||
159,200.0
|
||||
160,200.0
|
||||
161,200.0
|
||||
162,200.0
|
||||
163,200.0
|
||||
164,200.0
|
||||
165,200.0
|
||||
166,200.0
|
||||
167,200.0
|
||||
168,200.0
|
||||
169,200.0
|
||||
170,200.0
|
||||
171,200.0
|
||||
172,200.0
|
||||
173,200.0
|
||||
174,200.0
|
||||
175,200.0
|
||||
176,200.0
|
||||
177,200.0
|
||||
178,200.0
|
||||
179,200.0
|
||||
180,200.0
|
||||
181,200.0
|
||||
182,200.0
|
||||
183,200.0
|
||||
184,200.0
|
||||
185,200.0
|
||||
186,200.0
|
||||
187,200.0
|
||||
188,200.0
|
||||
189,200.0
|
||||
190,200.0
|
||||
191,200.0
|
||||
192,200.0
|
||||
193,200.0
|
||||
194,200.0
|
||||
195,200.0
|
||||
196,200.0
|
||||
197,200.0
|
||||
198,200.0
|
||||
199,200.0
|
||||
|
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-11-22 23:27:44
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-22 17:35:34
|
||||
LastEditTime: 2022-08-27 13:45:26
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
@@ -19,20 +19,23 @@ import numpy as np
|
||||
|
||||
class PolicyGradient:
|
||||
|
||||
def __init__(self, n_states,model,memory,cfg):
|
||||
self.gamma = cfg.gamma
|
||||
self.device = torch.device(cfg.device)
|
||||
def __init__(self, model,memory,cfg):
|
||||
self.gamma = cfg['gamma']
|
||||
self.device = torch.device(cfg['device'])
|
||||
self.memory = memory
|
||||
self.policy_net = model.to(self.device)
|
||||
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
|
||||
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg['lr'])
|
||||
|
||||
def sample_action(self,state):
|
||||
|
||||
state = torch.from_numpy(state).float()
|
||||
state = Variable(state)
|
||||
probs = self.policy_net(state)
|
||||
print("probs")
|
||||
print(probs)
|
||||
m = Bernoulli(probs) # 伯努利分布
|
||||
action = m.sample()
|
||||
|
||||
action = action.data.numpy().astype(int)[0] # 转为标量
|
||||
return action
|
||||
def predict_action(self,state):
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-11-22 23:21:53
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-22 17:40:07
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import argparse
|
||||
from itertools import count
|
||||
import torch.nn.functional as F
|
||||
from pg import PolicyGradient
|
||||
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
|
||||
from common.models import MLP
|
||||
from common.memories import PGReplay
|
||||
|
||||
|
||||
def get_args():
|
||||
""" Hyperparameters
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
|
||||
parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
|
||||
parser.add_argument('--update_fre',default=8,type=int)
|
||||
parser.add_argument('--hidden_dim',default=36,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=1,type=int,help="seed")
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results/' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models/' ) # path to save models
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
args = parser.parse_args([])
|
||||
return args
|
||||
|
||||
class PGNet(MLP):
|
||||
''' instead of outputing action, PG Net outputs propabilities of actions, we can use class inheritance from MLP here
|
||||
'''
|
||||
def forward(self, x):
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
x = F.sigmoid(self.fc3(x))
|
||||
return x
|
||||
|
||||
def env_agent_config(cfg):
|
||||
env = gym.make(cfg.env_name)
|
||||
if cfg.seed !=0: # set random seed
|
||||
all_seed(env,seed=cfg.seed)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"state dim: {n_states}, action dim: {n_actions}")
|
||||
model = PGNet(n_states,1,hidden_dim=cfg.hidden_dim)
|
||||
memory = PGReplay()
|
||||
agent = PolicyGradient(n_states,model,memory,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print('Start training!')
|
||||
print(f'Env:{cfg.env_name}, Algo:{cfg.algo_name}, Device:{cfg.device}')
|
||||
rewards = []
|
||||
for i_ep in range(cfg.train_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
for _ in count():
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
if done:
|
||||
reward = 0
|
||||
agent.memory.push((state,float(action),reward))
|
||||
state = next_state
|
||||
if done:
|
||||
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
|
||||
break
|
||||
if (i_ep+1) % cfg.update_fre == 0:
|
||||
agent.update()
|
||||
rewards.append(ep_reward)
|
||||
print('Finish training!')
|
||||
env.close() # close environment
|
||||
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
return res_dic
|
||||
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print("start testing!")
|
||||
print(f"Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}")
|
||||
rewards = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
for _ in count():
|
||||
action = agent.predict_action(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
if done:
|
||||
reward = 0
|
||||
state = next_state
|
||||
if done:
|
||||
print(f'Episode: {i_ep+1}/{cfg.test_eps},Reward: {ep_reward:.2f}')
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
print("finish testing!")
|
||||
env.close()
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
save_args(cfg,path = cfg.result_path) # save parameters
|
||||
agent.save_model(path = cfg.model_path) # save models
|
||||
save_results(res_dic, tag = 'train', path = cfg.result_path) # save results
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train") # plot results
|
||||
# testing
|
||||
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
|
||||
agent.load_model(path = cfg.model_path) # load model
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path = cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test")
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-09-11 23:03:00
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-24 11:27:01
|
||||
LastEditTime: 2022-08-26 22:46:21
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
@@ -18,136 +18,105 @@ sys.path.append(parent_path) # add path to system path
|
||||
import gym
|
||||
import datetime
|
||||
import argparse
|
||||
from envs.gridworld_env import CliffWalkingWapper,FrozenLakeWapper
|
||||
from envs.gridworld_env import FrozenLakeWapper
|
||||
from envs.wrappers import CliffWalkingWapper
|
||||
from envs.register import register_env
|
||||
from qlearning import QLearning
|
||||
from common.utils import plot_rewards,save_args,all_seed
|
||||
from common.utils import save_results,make_dir
|
||||
|
||||
def get_args():
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
if cfg['env_name'] == 'CliffWalking-v0':
|
||||
env = gym.make(cfg['env_name'])
|
||||
env = CliffWalkingWapper(env)
|
||||
if cfg['env_name'] == 'FrozenLake-v1':
|
||||
env = gym.make(cfg['env_name'],is_slippery=False)
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
n_states = env.observation_space.n # state dimension
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
agent = QLearning(cfg)
|
||||
return env,agent
|
||||
|
||||
def main(cfg,env,agent,tag = 'train'):
|
||||
print(f"Start {tag}ing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # 记录奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录每个回合的奖励
|
||||
state = env.reset() # 重置环境,即开始新的回合
|
||||
while True:
|
||||
if tag == 'train':action = agent.sample_action(state) # 根据算法采样一个动作
|
||||
else: agent.predict_action(state)
|
||||
next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
|
||||
if tag == 'train':agent.update(state, action, reward, next_state, done) # Q学习算法更新
|
||||
state = next_state # 更新状态
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f},Epsilon:{agent.epsilon}")
|
||||
print(f"Finish {tag}ing!")
|
||||
return {"rewards":rewards}
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['train_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0 # step per episode
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.update(state, action, reward, next_state, done) # update agent
|
||||
state = next_state # update state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
if (i_ep+1)%10==0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}, Epislon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
from common.utils import all_seed
|
||||
from common.launcher import Launcher
|
||||
|
||||
class Main(Launcher):
|
||||
def get_args(self):
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--seed',default=10,type=int,help="seed")
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
|
||||
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
|
||||
}
|
||||
args = {**vars(args),**default_args} # type(dict)
|
||||
return args
|
||||
def env_agent_config(self,cfg):
|
||||
''' create env and agent
|
||||
'''
|
||||
register_env(cfg['env_name'])
|
||||
env = gym.make(cfg['env_name'])
|
||||
if cfg['env_name'] == 'CliffWalking-v0':
|
||||
env = CliffWalkingWapper(env)
|
||||
if cfg['seed'] !=0: # set random seed
|
||||
all_seed(env,seed=cfg["seed"])
|
||||
try: # state dimension
|
||||
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
|
||||
except AttributeError:
|
||||
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
|
||||
n_actions = env.action_space.n # action dimension
|
||||
print(f"n_states: {n_states}, n_actions: {n_actions}")
|
||||
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
|
||||
agent = QLearning(cfg)
|
||||
return env,agent
|
||||
def train(self,cfg,env,agent):
|
||||
print("Start training!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['train_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0 # step per episode
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
action = agent.sample_action(state) # sample action
|
||||
next_state, reward, done, _ = env.step(action) # update env and return transitions
|
||||
agent.update(state, action, reward, next_state, done) # update agent
|
||||
state = next_state # update state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
if (i_ep+1)%10==0:
|
||||
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}, Epislon: {agent.epsilon:.3f}')
|
||||
print("Finish training!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
def test(self,cfg,env,agent):
|
||||
print("Start testing!")
|
||||
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
|
||||
rewards = [] # record rewards for all episodes
|
||||
steps = [] # record steps for all episodes
|
||||
for i_ep in range(cfg['test_eps']):
|
||||
ep_reward = 0 # reward per episode
|
||||
ep_step = 0
|
||||
state = env.reset() # reset and obtain initial state
|
||||
while True:
|
||||
action = agent.predict_action(state) # predict action
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
ep_step += 1
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
steps.append(ep_step)
|
||||
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
|
||||
print("Finish testing!")
|
||||
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
# training
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
save_args(cfg,path = cfg['result_path']) # save parameters
|
||||
agent.save_model(path = cfg['model_path']) # save models
|
||||
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
|
||||
# testing
|
||||
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
|
||||
agent.load_model(path = cfg['model_path']) # load model
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path = cfg['result_path'])
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "test")
|
||||
main = Main()
|
||||
main.run()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
episodes,rewards
|
||||
0,-13
|
||||
1,-13
|
||||
2,-13
|
||||
3,-13
|
||||
4,-13
|
||||
5,-13
|
||||
6,-13
|
||||
7,-13
|
||||
8,-13
|
||||
9,-13
|
||||
10,-13
|
||||
11,-13
|
||||
12,-13
|
||||
13,-13
|
||||
14,-13
|
||||
15,-13
|
||||
16,-13
|
||||
17,-13
|
||||
18,-13
|
||||
19,-13
|
||||
|
@@ -1,401 +0,0 @@
|
||||
episodes,rewards
|
||||
0,-2131
|
||||
1,-1086
|
||||
2,-586
|
||||
3,-220
|
||||
4,-154
|
||||
5,-122
|
||||
6,-150
|
||||
7,-159
|
||||
8,-164
|
||||
9,-88
|
||||
10,-195
|
||||
11,-114
|
||||
12,-60
|
||||
13,-179
|
||||
14,-101
|
||||
15,-304
|
||||
16,-96
|
||||
17,-119
|
||||
18,-113
|
||||
19,-98
|
||||
20,-106
|
||||
21,-105
|
||||
22,-77
|
||||
23,-51
|
||||
24,-105
|
||||
25,-136
|
||||
26,-100
|
||||
27,-29
|
||||
28,-79
|
||||
29,-114
|
||||
30,-82
|
||||
31,-70
|
||||
32,-75
|
||||
33,-51
|
||||
34,-94
|
||||
35,-52
|
||||
36,-93
|
||||
37,-71
|
||||
38,-73
|
||||
39,-48
|
||||
40,-52
|
||||
41,-96
|
||||
42,-46
|
||||
43,-65
|
||||
44,-57
|
||||
45,-41
|
||||
46,-104
|
||||
47,-51
|
||||
48,-181
|
||||
49,-229
|
||||
50,-39
|
||||
51,-69
|
||||
52,-53
|
||||
53,-59
|
||||
54,-26
|
||||
55,-75
|
||||
56,-31
|
||||
57,-60
|
||||
58,-63
|
||||
59,-40
|
||||
60,-35
|
||||
61,-79
|
||||
62,-42
|
||||
63,-22
|
||||
64,-73
|
||||
65,-71
|
||||
66,-18
|
||||
67,-55
|
||||
68,-29
|
||||
69,-43
|
||||
70,-70
|
||||
71,-49
|
||||
72,-42
|
||||
73,-29
|
||||
74,-81
|
||||
75,-36
|
||||
76,-38
|
||||
77,-36
|
||||
78,-52
|
||||
79,-28
|
||||
80,-42
|
||||
81,-52
|
||||
82,-66
|
||||
83,-31
|
||||
84,-27
|
||||
85,-49
|
||||
86,-28
|
||||
87,-54
|
||||
88,-34
|
||||
89,-35
|
||||
90,-50
|
||||
91,-36
|
||||
92,-36
|
||||
93,-46
|
||||
94,-34
|
||||
95,-135
|
||||
96,-39
|
||||
97,-36
|
||||
98,-26
|
||||
99,-56
|
||||
100,-40
|
||||
101,-40
|
||||
102,-26
|
||||
103,-28
|
||||
104,-31
|
||||
105,-35
|
||||
106,-26
|
||||
107,-57
|
||||
108,-44
|
||||
109,-41
|
||||
110,-31
|
||||
111,-26
|
||||
112,-25
|
||||
113,-41
|
||||
114,-32
|
||||
115,-44
|
||||
116,-30
|
||||
117,-32
|
||||
118,-30
|
||||
119,-25
|
||||
120,-23
|
||||
121,-47
|
||||
122,-24
|
||||
123,-45
|
||||
124,-39
|
||||
125,-21
|
||||
126,-43
|
||||
127,-143
|
||||
128,-26
|
||||
129,-20
|
||||
130,-32
|
||||
131,-16
|
||||
132,-24
|
||||
133,-42
|
||||
134,-25
|
||||
135,-36
|
||||
136,-19
|
||||
137,-29
|
||||
138,-43
|
||||
139,-17
|
||||
140,-150
|
||||
141,-32
|
||||
142,-34
|
||||
143,-19
|
||||
144,-26
|
||||
145,-30
|
||||
146,-31
|
||||
147,-49
|
||||
148,-33
|
||||
149,-21
|
||||
150,-17
|
||||
151,-48
|
||||
152,-34
|
||||
153,-20
|
||||
154,-20
|
||||
155,-26
|
||||
156,-21
|
||||
157,-13
|
||||
158,-40
|
||||
159,-22
|
||||
160,-26
|
||||
161,-30
|
||||
162,-29
|
||||
163,-25
|
||||
164,-26
|
||||
165,-27
|
||||
166,-21
|
||||
167,-29
|
||||
168,-24
|
||||
169,-17
|
||||
170,-22
|
||||
171,-35
|
||||
172,-35
|
||||
173,-18
|
||||
174,-135
|
||||
175,-15
|
||||
176,-23
|
||||
177,-28
|
||||
178,-25
|
||||
179,-24
|
||||
180,-29
|
||||
181,-31
|
||||
182,-24
|
||||
183,-129
|
||||
184,-45
|
||||
185,-24
|
||||
186,-17
|
||||
187,-20
|
||||
188,-21
|
||||
189,-23
|
||||
190,-15
|
||||
191,-32
|
||||
192,-22
|
||||
193,-19
|
||||
194,-17
|
||||
195,-45
|
||||
196,-15
|
||||
197,-14
|
||||
198,-14
|
||||
199,-37
|
||||
200,-23
|
||||
201,-17
|
||||
202,-19
|
||||
203,-21
|
||||
204,-23
|
||||
205,-27
|
||||
206,-14
|
||||
207,-18
|
||||
208,-23
|
||||
209,-34
|
||||
210,-23
|
||||
211,-13
|
||||
212,-25
|
||||
213,-17
|
||||
214,-13
|
||||
215,-21
|
||||
216,-29
|
||||
217,-18
|
||||
218,-24
|
||||
219,-15
|
||||
220,-27
|
||||
221,-25
|
||||
222,-21
|
||||
223,-19
|
||||
224,-17
|
||||
225,-18
|
||||
226,-13
|
||||
227,-22
|
||||
228,-14
|
||||
229,-13
|
||||
230,-29
|
||||
231,-23
|
||||
232,-15
|
||||
233,-15
|
||||
234,-14
|
||||
235,-28
|
||||
236,-25
|
||||
237,-17
|
||||
238,-23
|
||||
239,-29
|
||||
240,-15
|
||||
241,-14
|
||||
242,-15
|
||||
243,-23
|
||||
244,-15
|
||||
245,-16
|
||||
246,-19
|
||||
247,-13
|
||||
248,-16
|
||||
249,-17
|
||||
250,-25
|
||||
251,-30
|
||||
252,-13
|
||||
253,-14
|
||||
254,-15
|
||||
255,-22
|
||||
256,-14
|
||||
257,-17
|
||||
258,-126
|
||||
259,-15
|
||||
260,-21
|
||||
261,-16
|
||||
262,-23
|
||||
263,-14
|
||||
264,-13
|
||||
265,-13
|
||||
266,-19
|
||||
267,-13
|
||||
268,-19
|
||||
269,-17
|
||||
270,-17
|
||||
271,-13
|
||||
272,-19
|
||||
273,-13
|
||||
274,-13
|
||||
275,-16
|
||||
276,-22
|
||||
277,-14
|
||||
278,-15
|
||||
279,-19
|
||||
280,-34
|
||||
281,-13
|
||||
282,-15
|
||||
283,-32
|
||||
284,-13
|
||||
285,-13
|
||||
286,-13
|
||||
287,-14
|
||||
288,-16
|
||||
289,-13
|
||||
290,-13
|
||||
291,-17
|
||||
292,-13
|
||||
293,-13
|
||||
294,-22
|
||||
295,-14
|
||||
296,-15
|
||||
297,-13
|
||||
298,-13
|
||||
299,-13
|
||||
300,-16
|
||||
301,-13
|
||||
302,-14
|
||||
303,-13
|
||||
304,-13
|
||||
305,-13
|
||||
306,-24
|
||||
307,-13
|
||||
308,-13
|
||||
309,-15
|
||||
310,-13
|
||||
311,-13
|
||||
312,-13
|
||||
313,-15
|
||||
314,-13
|
||||
315,-19
|
||||
316,-15
|
||||
317,-17
|
||||
318,-13
|
||||
319,-13
|
||||
320,-13
|
||||
321,-13
|
||||
322,-13
|
||||
323,-15
|
||||
324,-13
|
||||
325,-13
|
||||
326,-13
|
||||
327,-123
|
||||
328,-13
|
||||
329,-13
|
||||
330,-13
|
||||
331,-13
|
||||
332,-13
|
||||
333,-13
|
||||
334,-13
|
||||
335,-13
|
||||
336,-16
|
||||
337,-13
|
||||
338,-23
|
||||
339,-13
|
||||
340,-13
|
||||
341,-13
|
||||
342,-13
|
||||
343,-13
|
||||
344,-13
|
||||
345,-13
|
||||
346,-13
|
||||
347,-13
|
||||
348,-13
|
||||
349,-13
|
||||
350,-134
|
||||
351,-13
|
||||
352,-13
|
||||
353,-13
|
||||
354,-13
|
||||
355,-13
|
||||
356,-13
|
||||
357,-13
|
||||
358,-13
|
||||
359,-13
|
||||
360,-15
|
||||
361,-13
|
||||
362,-13
|
||||
363,-13
|
||||
364,-13
|
||||
365,-13
|
||||
366,-13
|
||||
367,-13
|
||||
368,-13
|
||||
369,-14
|
||||
370,-13
|
||||
371,-13
|
||||
372,-13
|
||||
373,-13
|
||||
374,-13
|
||||
375,-13
|
||||
376,-13
|
||||
377,-124
|
||||
378,-13
|
||||
379,-13
|
||||
380,-13
|
||||
381,-13
|
||||
382,-13
|
||||
383,-13
|
||||
384,-13
|
||||
385,-13
|
||||
386,-13
|
||||
387,-13
|
||||
388,-13
|
||||
389,-121
|
||||
390,-13
|
||||
391,-13
|
||||
392,-13
|
||||
393,-13
|
||||
394,-13
|
||||
395,-13
|
||||
396,-13
|
||||
397,-13
|
||||
398,-17
|
||||
399,-13
|
||||
|
@@ -0,0 +1 @@
|
||||
{"algo_name": "Q-learning", "env_name": "CliffWalking-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/models/", "n_states": 48, "n_actions": 4}
|
||||
|
Before Width: | Height: | Size: 24 KiB After Width: | Height: | Size: 24 KiB |
@@ -0,0 +1,21 @@
|
||||
episodes,rewards,steps
|
||||
0,-13,13
|
||||
1,-13,13
|
||||
2,-13,13
|
||||
3,-13,13
|
||||
4,-13,13
|
||||
5,-13,13
|
||||
6,-13,13
|
||||
7,-13,13
|
||||
8,-13,13
|
||||
9,-13,13
|
||||
10,-13,13
|
||||
11,-13,13
|
||||
12,-13,13
|
||||
13,-13,13
|
||||
14,-13,13
|
||||
15,-13,13
|
||||
16,-13,13
|
||||
17,-13,13
|
||||
18,-13,13
|
||||
19,-13,13
|
||||
|
|
Before Width: | Height: | Size: 35 KiB After Width: | Height: | Size: 35 KiB |
@@ -0,0 +1,401 @@
|
||||
episodes,rewards,steps
|
||||
0,-2131,448
|
||||
1,-1086,492
|
||||
2,-586,388
|
||||
3,-220,220
|
||||
4,-154,154
|
||||
5,-122,122
|
||||
6,-150,150
|
||||
7,-159,159
|
||||
8,-164,164
|
||||
9,-88,88
|
||||
10,-195,195
|
||||
11,-114,114
|
||||
12,-60,60
|
||||
13,-179,179
|
||||
14,-101,101
|
||||
15,-304,205
|
||||
16,-96,96
|
||||
17,-119,119
|
||||
18,-113,113
|
||||
19,-98,98
|
||||
20,-106,106
|
||||
21,-105,105
|
||||
22,-77,77
|
||||
23,-51,51
|
||||
24,-105,105
|
||||
25,-136,136
|
||||
26,-100,100
|
||||
27,-29,29
|
||||
28,-79,79
|
||||
29,-114,114
|
||||
30,-82,82
|
||||
31,-70,70
|
||||
32,-75,75
|
||||
33,-51,51
|
||||
34,-94,94
|
||||
35,-52,52
|
||||
36,-93,93
|
||||
37,-71,71
|
||||
38,-73,73
|
||||
39,-48,48
|
||||
40,-52,52
|
||||
41,-96,96
|
||||
42,-46,46
|
||||
43,-65,65
|
||||
44,-57,57
|
||||
45,-41,41
|
||||
46,-104,104
|
||||
47,-51,51
|
||||
48,-181,82
|
||||
49,-229,130
|
||||
50,-39,39
|
||||
51,-69,69
|
||||
52,-53,53
|
||||
53,-59,59
|
||||
54,-26,26
|
||||
55,-75,75
|
||||
56,-31,31
|
||||
57,-60,60
|
||||
58,-63,63
|
||||
59,-40,40
|
||||
60,-35,35
|
||||
61,-79,79
|
||||
62,-42,42
|
||||
63,-22,22
|
||||
64,-73,73
|
||||
65,-71,71
|
||||
66,-18,18
|
||||
67,-55,55
|
||||
68,-29,29
|
||||
69,-43,43
|
||||
70,-70,70
|
||||
71,-49,49
|
||||
72,-42,42
|
||||
73,-29,29
|
||||
74,-81,81
|
||||
75,-36,36
|
||||
76,-38,38
|
||||
77,-36,36
|
||||
78,-52,52
|
||||
79,-28,28
|
||||
80,-42,42
|
||||
81,-52,52
|
||||
82,-66,66
|
||||
83,-31,31
|
||||
84,-27,27
|
||||
85,-49,49
|
||||
86,-28,28
|
||||
87,-54,54
|
||||
88,-34,34
|
||||
89,-35,35
|
||||
90,-50,50
|
||||
91,-36,36
|
||||
92,-36,36
|
||||
93,-46,46
|
||||
94,-34,34
|
||||
95,-135,36
|
||||
96,-39,39
|
||||
97,-36,36
|
||||
98,-26,26
|
||||
99,-56,56
|
||||
100,-40,40
|
||||
101,-40,40
|
||||
102,-26,26
|
||||
103,-28,28
|
||||
104,-31,31
|
||||
105,-35,35
|
||||
106,-26,26
|
||||
107,-57,57
|
||||
108,-44,44
|
||||
109,-41,41
|
||||
110,-31,31
|
||||
111,-26,26
|
||||
112,-25,25
|
||||
113,-41,41
|
||||
114,-32,32
|
||||
115,-44,44
|
||||
116,-30,30
|
||||
117,-32,32
|
||||
118,-30,30
|
||||
119,-25,25
|
||||
120,-23,23
|
||||
121,-47,47
|
||||
122,-24,24
|
||||
123,-45,45
|
||||
124,-39,39
|
||||
125,-21,21
|
||||
126,-43,43
|
||||
127,-143,44
|
||||
128,-26,26
|
||||
129,-20,20
|
||||
130,-32,32
|
||||
131,-16,16
|
||||
132,-24,24
|
||||
133,-42,42
|
||||
134,-25,25
|
||||
135,-36,36
|
||||
136,-19,19
|
||||
137,-29,29
|
||||
138,-43,43
|
||||
139,-17,17
|
||||
140,-150,51
|
||||
141,-32,32
|
||||
142,-34,34
|
||||
143,-19,19
|
||||
144,-26,26
|
||||
145,-30,30
|
||||
146,-31,31
|
||||
147,-49,49
|
||||
148,-33,33
|
||||
149,-21,21
|
||||
150,-17,17
|
||||
151,-48,48
|
||||
152,-34,34
|
||||
153,-20,20
|
||||
154,-20,20
|
||||
155,-26,26
|
||||
156,-21,21
|
||||
157,-13,13
|
||||
158,-40,40
|
||||
159,-22,22
|
||||
160,-26,26
|
||||
161,-30,30
|
||||
162,-29,29
|
||||
163,-25,25
|
||||
164,-26,26
|
||||
165,-27,27
|
||||
166,-21,21
|
||||
167,-29,29
|
||||
168,-24,24
|
||||
169,-17,17
|
||||
170,-22,22
|
||||
171,-35,35
|
||||
172,-35,35
|
||||
173,-18,18
|
||||
174,-135,36
|
||||
175,-15,15
|
||||
176,-23,23
|
||||
177,-28,28
|
||||
178,-25,25
|
||||
179,-24,24
|
||||
180,-29,29
|
||||
181,-31,31
|
||||
182,-24,24
|
||||
183,-129,30
|
||||
184,-45,45
|
||||
185,-24,24
|
||||
186,-17,17
|
||||
187,-20,20
|
||||
188,-21,21
|
||||
189,-23,23
|
||||
190,-15,15
|
||||
191,-32,32
|
||||
192,-22,22
|
||||
193,-19,19
|
||||
194,-17,17
|
||||
195,-45,45
|
||||
196,-15,15
|
||||
197,-14,14
|
||||
198,-14,14
|
||||
199,-37,37
|
||||
200,-23,23
|
||||
201,-17,17
|
||||
202,-19,19
|
||||
203,-21,21
|
||||
204,-23,23
|
||||
205,-27,27
|
||||
206,-14,14
|
||||
207,-18,18
|
||||
208,-23,23
|
||||
209,-34,34
|
||||
210,-23,23
|
||||
211,-13,13
|
||||
212,-25,25
|
||||
213,-17,17
|
||||
214,-13,13
|
||||
215,-21,21
|
||||
216,-29,29
|
||||
217,-18,18
|
||||
218,-24,24
|
||||
219,-15,15
|
||||
220,-27,27
|
||||
221,-25,25
|
||||
222,-21,21
|
||||
223,-19,19
|
||||
224,-17,17
|
||||
225,-18,18
|
||||
226,-13,13
|
||||
227,-22,22
|
||||
228,-14,14
|
||||
229,-13,13
|
||||
230,-29,29
|
||||
231,-23,23
|
||||
232,-15,15
|
||||
233,-15,15
|
||||
234,-14,14
|
||||
235,-28,28
|
||||
236,-25,25
|
||||
237,-17,17
|
||||
238,-23,23
|
||||
239,-29,29
|
||||
240,-15,15
|
||||
241,-14,14
|
||||
242,-15,15
|
||||
243,-23,23
|
||||
244,-15,15
|
||||
245,-16,16
|
||||
246,-19,19
|
||||
247,-13,13
|
||||
248,-16,16
|
||||
249,-17,17
|
||||
250,-25,25
|
||||
251,-30,30
|
||||
252,-13,13
|
||||
253,-14,14
|
||||
254,-15,15
|
||||
255,-22,22
|
||||
256,-14,14
|
||||
257,-17,17
|
||||
258,-126,27
|
||||
259,-15,15
|
||||
260,-21,21
|
||||
261,-16,16
|
||||
262,-23,23
|
||||
263,-14,14
|
||||
264,-13,13
|
||||
265,-13,13
|
||||
266,-19,19
|
||||
267,-13,13
|
||||
268,-19,19
|
||||
269,-17,17
|
||||
270,-17,17
|
||||
271,-13,13
|
||||
272,-19,19
|
||||
273,-13,13
|
||||
274,-13,13
|
||||
275,-16,16
|
||||
276,-22,22
|
||||
277,-14,14
|
||||
278,-15,15
|
||||
279,-19,19
|
||||
280,-34,34
|
||||
281,-13,13
|
||||
282,-15,15
|
||||
283,-32,32
|
||||
284,-13,13
|
||||
285,-13,13
|
||||
286,-13,13
|
||||
287,-14,14
|
||||
288,-16,16
|
||||
289,-13,13
|
||||
290,-13,13
|
||||
291,-17,17
|
||||
292,-13,13
|
||||
293,-13,13
|
||||
294,-22,22
|
||||
295,-14,14
|
||||
296,-15,15
|
||||
297,-13,13
|
||||
298,-13,13
|
||||
299,-13,13
|
||||
300,-16,16
|
||||
301,-13,13
|
||||
302,-14,14
|
||||
303,-13,13
|
||||
304,-13,13
|
||||
305,-13,13
|
||||
306,-24,24
|
||||
307,-13,13
|
||||
308,-13,13
|
||||
309,-15,15
|
||||
310,-13,13
|
||||
311,-13,13
|
||||
312,-13,13
|
||||
313,-15,15
|
||||
314,-13,13
|
||||
315,-19,19
|
||||
316,-15,15
|
||||
317,-17,17
|
||||
318,-13,13
|
||||
319,-13,13
|
||||
320,-13,13
|
||||
321,-13,13
|
||||
322,-13,13
|
||||
323,-15,15
|
||||
324,-13,13
|
||||
325,-13,13
|
||||
326,-13,13
|
||||
327,-123,24
|
||||
328,-13,13
|
||||
329,-13,13
|
||||
330,-13,13
|
||||
331,-13,13
|
||||
332,-13,13
|
||||
333,-13,13
|
||||
334,-13,13
|
||||
335,-13,13
|
||||
336,-16,16
|
||||
337,-13,13
|
||||
338,-23,23
|
||||
339,-13,13
|
||||
340,-13,13
|
||||
341,-13,13
|
||||
342,-13,13
|
||||
343,-13,13
|
||||
344,-13,13
|
||||
345,-13,13
|
||||
346,-13,13
|
||||
347,-13,13
|
||||
348,-13,13
|
||||
349,-13,13
|
||||
350,-134,35
|
||||
351,-13,13
|
||||
352,-13,13
|
||||
353,-13,13
|
||||
354,-13,13
|
||||
355,-13,13
|
||||
356,-13,13
|
||||
357,-13,13
|
||||
358,-13,13
|
||||
359,-13,13
|
||||
360,-15,15
|
||||
361,-13,13
|
||||
362,-13,13
|
||||
363,-13,13
|
||||
364,-13,13
|
||||
365,-13,13
|
||||
366,-13,13
|
||||
367,-13,13
|
||||
368,-13,13
|
||||
369,-14,14
|
||||
370,-13,13
|
||||
371,-13,13
|
||||
372,-13,13
|
||||
373,-13,13
|
||||
374,-13,13
|
||||
375,-13,13
|
||||
376,-13,13
|
||||
377,-124,25
|
||||
378,-13,13
|
||||
379,-13,13
|
||||
380,-13,13
|
||||
381,-13,13
|
||||
382,-13,13
|
||||
383,-13,13
|
||||
384,-13,13
|
||||
385,-13,13
|
||||
386,-13,13
|
||||
387,-13,13
|
||||
388,-13,13
|
||||
389,-121,22
|
||||
390,-13,13
|
||||
391,-13,13
|
||||
392,-13,13
|
||||
393,-13,13
|
||||
394,-13,13
|
||||
395,-13,13
|
||||
396,-13,13
|
||||
397,-13,13
|
||||
398,-17,17
|
||||
399,-13,13
|
||||
|
|
Before Width: | Height: | Size: 22 KiB |
|
Before Width: | Height: | Size: 53 KiB |
@@ -1,801 +0,0 @@
|
||||
episodes,rewards,steps
|
||||
0,0.0,20
|
||||
1,0.0,14
|
||||
2,0.0,13
|
||||
3,0.0,9
|
||||
4,0.0,10
|
||||
5,0.0,6
|
||||
6,0.0,11
|
||||
7,0.0,6
|
||||
8,0.0,3
|
||||
9,0.0,9
|
||||
10,0.0,11
|
||||
11,0.0,22
|
||||
12,0.0,5
|
||||
13,0.0,16
|
||||
14,0.0,4
|
||||
15,0.0,9
|
||||
16,0.0,18
|
||||
17,0.0,2
|
||||
18,0.0,4
|
||||
19,0.0,8
|
||||
20,0.0,7
|
||||
21,0.0,4
|
||||
22,0.0,22
|
||||
23,0.0,15
|
||||
24,0.0,5
|
||||
25,0.0,16
|
||||
26,0.0,7
|
||||
27,0.0,19
|
||||
28,0.0,22
|
||||
29,0.0,16
|
||||
30,0.0,11
|
||||
31,0.0,22
|
||||
32,0.0,28
|
||||
33,0.0,23
|
||||
34,0.0,4
|
||||
35,0.0,11
|
||||
36,0.0,8
|
||||
37,0.0,15
|
||||
38,0.0,5
|
||||
39,0.0,7
|
||||
40,0.0,9
|
||||
41,0.0,4
|
||||
42,0.0,3
|
||||
43,0.0,6
|
||||
44,0.0,41
|
||||
45,0.0,9
|
||||
46,0.0,23
|
||||
47,0.0,3
|
||||
48,1.0,38
|
||||
49,0.0,29
|
||||
50,0.0,17
|
||||
51,0.0,4
|
||||
52,0.0,2
|
||||
53,0.0,25
|
||||
54,0.0,6
|
||||
55,0.0,2
|
||||
56,0.0,30
|
||||
57,0.0,6
|
||||
58,0.0,7
|
||||
59,0.0,11
|
||||
60,0.0,9
|
||||
61,0.0,8
|
||||
62,0.0,23
|
||||
63,0.0,10
|
||||
64,0.0,3
|
||||
65,0.0,5
|
||||
66,0.0,7
|
||||
67,0.0,18
|
||||
68,0.0,8
|
||||
69,0.0,26
|
||||
70,0.0,6
|
||||
71,0.0,14
|
||||
72,0.0,4
|
||||
73,0.0,25
|
||||
74,0.0,21
|
||||
75,0.0,13
|
||||
76,0.0,4
|
||||
77,0.0,29
|
||||
78,0.0,21
|
||||
79,0.0,6
|
||||
80,0.0,6
|
||||
81,0.0,11
|
||||
82,0.0,21
|
||||
83,0.0,9
|
||||
84,0.0,9
|
||||
85,0.0,7
|
||||
86,0.0,48
|
||||
87,0.0,23
|
||||
88,0.0,100
|
||||
89,0.0,60
|
||||
90,0.0,7
|
||||
91,0.0,10
|
||||
92,0.0,24
|
||||
93,0.0,4
|
||||
94,0.0,7
|
||||
95,0.0,17
|
||||
96,0.0,87
|
||||
97,0.0,28
|
||||
98,0.0,7
|
||||
99,0.0,5
|
||||
100,0.0,12
|
||||
101,0.0,14
|
||||
102,0.0,6
|
||||
103,0.0,13
|
||||
104,0.0,93
|
||||
105,0.0,4
|
||||
106,0.0,50
|
||||
107,0.0,8
|
||||
108,0.0,12
|
||||
109,0.0,43
|
||||
110,0.0,30
|
||||
111,0.0,15
|
||||
112,0.0,19
|
||||
113,0.0,100
|
||||
114,0.0,82
|
||||
115,0.0,40
|
||||
116,0.0,88
|
||||
117,0.0,19
|
||||
118,0.0,30
|
||||
119,0.0,27
|
||||
120,0.0,5
|
||||
121,0.0,87
|
||||
122,0.0,9
|
||||
123,0.0,64
|
||||
124,0.0,27
|
||||
125,0.0,68
|
||||
126,0.0,81
|
||||
127,0.0,86
|
||||
128,0.0,100
|
||||
129,0.0,100
|
||||
130,0.0,27
|
||||
131,0.0,41
|
||||
132,0.0,70
|
||||
133,0.0,27
|
||||
134,0.0,6
|
||||
135,0.0,18
|
||||
136,0.0,38
|
||||
137,0.0,26
|
||||
138,0.0,36
|
||||
139,0.0,3
|
||||
140,0.0,61
|
||||
141,0.0,100
|
||||
142,0.0,4
|
||||
143,0.0,39
|
||||
144,0.0,18
|
||||
145,0.0,33
|
||||
146,0.0,29
|
||||
147,0.0,49
|
||||
148,0.0,88
|
||||
149,0.0,22
|
||||
150,0.0,65
|
||||
151,0.0,36
|
||||
152,0.0,30
|
||||
153,0.0,58
|
||||
154,0.0,43
|
||||
155,0.0,53
|
||||
156,0.0,43
|
||||
157,0.0,13
|
||||
158,0.0,8
|
||||
159,0.0,39
|
||||
160,0.0,29
|
||||
161,0.0,26
|
||||
162,0.0,60
|
||||
163,0.0,100
|
||||
164,0.0,31
|
||||
165,0.0,22
|
||||
166,0.0,100
|
||||
167,0.0,46
|
||||
168,0.0,23
|
||||
169,0.0,54
|
||||
170,0.0,8
|
||||
171,0.0,58
|
||||
172,0.0,3
|
||||
173,0.0,47
|
||||
174,0.0,16
|
||||
175,0.0,21
|
||||
176,0.0,44
|
||||
177,0.0,29
|
||||
178,0.0,100
|
||||
179,0.0,100
|
||||
180,0.0,62
|
||||
181,0.0,83
|
||||
182,0.0,26
|
||||
183,0.0,24
|
||||
184,0.0,10
|
||||
185,0.0,12
|
||||
186,0.0,40
|
||||
187,0.0,25
|
||||
188,0.0,18
|
||||
189,0.0,60
|
||||
190,0.0,100
|
||||
191,0.0,100
|
||||
192,0.0,24
|
||||
193,0.0,56
|
||||
194,0.0,71
|
||||
195,0.0,19
|
||||
196,0.0,100
|
||||
197,0.0,44
|
||||
198,0.0,41
|
||||
199,0.0,41
|
||||
200,0.0,60
|
||||
201,0.0,31
|
||||
202,0.0,34
|
||||
203,0.0,35
|
||||
204,0.0,59
|
||||
205,0.0,51
|
||||
206,0.0,100
|
||||
207,0.0,100
|
||||
208,0.0,100
|
||||
209,0.0,100
|
||||
210,0.0,37
|
||||
211,0.0,68
|
||||
212,0.0,40
|
||||
213,0.0,17
|
||||
214,0.0,79
|
||||
215,0.0,100
|
||||
216,0.0,26
|
||||
217,0.0,61
|
||||
218,0.0,25
|
||||
219,0.0,18
|
||||
220,0.0,27
|
||||
221,0.0,13
|
||||
222,0.0,100
|
||||
223,0.0,87
|
||||
224,0.0,100
|
||||
225,0.0,92
|
||||
226,0.0,100
|
||||
227,0.0,8
|
||||
228,0.0,100
|
||||
229,0.0,64
|
||||
230,0.0,17
|
||||
231,0.0,82
|
||||
232,0.0,100
|
||||
233,0.0,94
|
||||
234,0.0,7
|
||||
235,0.0,36
|
||||
236,0.0,100
|
||||
237,0.0,56
|
||||
238,0.0,17
|
||||
239,0.0,100
|
||||
240,0.0,83
|
||||
241,0.0,100
|
||||
242,0.0,100
|
||||
243,0.0,43
|
||||
244,0.0,87
|
||||
245,0.0,42
|
||||
246,0.0,80
|
||||
247,0.0,54
|
||||
248,0.0,82
|
||||
249,0.0,97
|
||||
250,0.0,65
|
||||
251,0.0,83
|
||||
252,0.0,100
|
||||
253,0.0,59
|
||||
254,0.0,100
|
||||
255,0.0,78
|
||||
256,0.0,100
|
||||
257,0.0,100
|
||||
258,0.0,43
|
||||
259,0.0,80
|
||||
260,0.0,100
|
||||
261,0.0,70
|
||||
262,0.0,94
|
||||
263,0.0,100
|
||||
264,0.0,100
|
||||
265,0.0,37
|
||||
266,0.0,11
|
||||
267,0.0,31
|
||||
268,0.0,100
|
||||
269,0.0,34
|
||||
270,0.0,32
|
||||
271,0.0,58
|
||||
272,0.0,38
|
||||
273,0.0,28
|
||||
274,0.0,100
|
||||
275,0.0,59
|
||||
276,0.0,100
|
||||
277,0.0,82
|
||||
278,0.0,51
|
||||
279,0.0,25
|
||||
280,0.0,73
|
||||
281,0.0,56
|
||||
282,0.0,55
|
||||
283,0.0,38
|
||||
284,0.0,100
|
||||
285,0.0,100
|
||||
286,0.0,92
|
||||
287,0.0,100
|
||||
288,0.0,100
|
||||
289,0.0,100
|
||||
290,0.0,37
|
||||
291,0.0,100
|
||||
292,0.0,66
|
||||
293,0.0,24
|
||||
294,0.0,17
|
||||
295,0.0,100
|
||||
296,0.0,59
|
||||
297,0.0,25
|
||||
298,0.0,73
|
||||
299,0.0,100
|
||||
300,0.0,29
|
||||
301,0.0,100
|
||||
302,0.0,72
|
||||
303,0.0,6
|
||||
304,1.0,57
|
||||
305,0.0,47
|
||||
306,0.0,48
|
||||
307,0.0,13
|
||||
308,0.0,100
|
||||
309,0.0,38
|
||||
310,0.0,100
|
||||
311,0.0,20
|
||||
312,0.0,100
|
||||
313,0.0,100
|
||||
314,0.0,5
|
||||
315,0.0,39
|
||||
316,0.0,11
|
||||
317,0.0,83
|
||||
318,0.0,42
|
||||
319,0.0,100
|
||||
320,0.0,99
|
||||
321,0.0,83
|
||||
322,0.0,28
|
||||
323,0.0,46
|
||||
324,0.0,100
|
||||
325,0.0,100
|
||||
326,0.0,62
|
||||
327,0.0,100
|
||||
328,0.0,23
|
||||
329,0.0,91
|
||||
330,0.0,53
|
||||
331,0.0,19
|
||||
332,0.0,26
|
||||
333,0.0,93
|
||||
334,0.0,38
|
||||
335,0.0,22
|
||||
336,0.0,43
|
||||
337,0.0,100
|
||||
338,0.0,90
|
||||
339,0.0,18
|
||||
340,0.0,45
|
||||
341,0.0,65
|
||||
342,1.0,22
|
||||
343,0.0,100
|
||||
344,1.0,15
|
||||
345,1.0,72
|
||||
346,0.0,5
|
||||
347,1.0,6
|
||||
348,1.0,6
|
||||
349,1.0,9
|
||||
350,1.0,8
|
||||
351,1.0,9
|
||||
352,1.0,8
|
||||
353,1.0,6
|
||||
354,1.0,6
|
||||
355,1.0,10
|
||||
356,1.0,6
|
||||
357,0.0,5
|
||||
358,0.0,3
|
||||
359,1.0,6
|
||||
360,1.0,6
|
||||
361,1.0,6
|
||||
362,1.0,6
|
||||
363,1.0,8
|
||||
364,1.0,6
|
||||
365,1.0,8
|
||||
366,1.0,6
|
||||
367,1.0,6
|
||||
368,1.0,8
|
||||
369,1.0,6
|
||||
370,1.0,6
|
||||
371,0.0,5
|
||||
372,1.0,6
|
||||
373,0.0,6
|
||||
374,1.0,6
|
||||
375,1.0,12
|
||||
376,1.0,6
|
||||
377,1.0,6
|
||||
378,1.0,9
|
||||
379,1.0,6
|
||||
380,1.0,6
|
||||
381,0.0,2
|
||||
382,0.0,3
|
||||
383,0.0,2
|
||||
384,0.0,4
|
||||
385,0.0,3
|
||||
386,1.0,7
|
||||
387,1.0,6
|
||||
388,1.0,6
|
||||
389,1.0,8
|
||||
390,1.0,9
|
||||
391,1.0,8
|
||||
392,1.0,8
|
||||
393,1.0,6
|
||||
394,1.0,6
|
||||
395,1.0,7
|
||||
396,1.0,6
|
||||
397,0.0,5
|
||||
398,0.0,5
|
||||
399,1.0,10
|
||||
400,1.0,6
|
||||
401,0.0,3
|
||||
402,1.0,6
|
||||
403,1.0,7
|
||||
404,1.0,6
|
||||
405,1.0,6
|
||||
406,1.0,6
|
||||
407,1.0,6
|
||||
408,1.0,6
|
||||
409,1.0,6
|
||||
410,1.0,6
|
||||
411,0.0,5
|
||||
412,1.0,6
|
||||
413,1.0,6
|
||||
414,0.0,2
|
||||
415,1.0,6
|
||||
416,1.0,6
|
||||
417,1.0,6
|
||||
418,1.0,6
|
||||
419,1.0,6
|
||||
420,1.0,8
|
||||
421,1.0,6
|
||||
422,1.0,6
|
||||
423,1.0,6
|
||||
424,1.0,6
|
||||
425,1.0,7
|
||||
426,0.0,5
|
||||
427,1.0,6
|
||||
428,1.0,6
|
||||
429,1.0,6
|
||||
430,1.0,8
|
||||
431,1.0,6
|
||||
432,1.0,6
|
||||
433,1.0,6
|
||||
434,1.0,6
|
||||
435,0.0,2
|
||||
436,1.0,8
|
||||
437,1.0,7
|
||||
438,1.0,6
|
||||
439,1.0,7
|
||||
440,1.0,6
|
||||
441,1.0,6
|
||||
442,0.0,3
|
||||
443,0.0,4
|
||||
444,1.0,6
|
||||
445,1.0,6
|
||||
446,1.0,7
|
||||
447,1.0,6
|
||||
448,1.0,6
|
||||
449,1.0,6
|
||||
450,1.0,6
|
||||
451,1.0,6
|
||||
452,1.0,6
|
||||
453,1.0,8
|
||||
454,1.0,6
|
||||
455,1.0,6
|
||||
456,1.0,6
|
||||
457,1.0,6
|
||||
458,1.0,6
|
||||
459,1.0,7
|
||||
460,1.0,8
|
||||
461,1.0,6
|
||||
462,1.0,7
|
||||
463,1.0,6
|
||||
464,1.0,6
|
||||
465,1.0,6
|
||||
466,1.0,6
|
||||
467,1.0,8
|
||||
468,1.0,6
|
||||
469,1.0,6
|
||||
470,1.0,8
|
||||
471,1.0,6
|
||||
472,1.0,11
|
||||
473,1.0,6
|
||||
474,1.0,6
|
||||
475,1.0,6
|
||||
476,1.0,8
|
||||
477,0.0,2
|
||||
478,1.0,7
|
||||
479,1.0,6
|
||||
480,1.0,6
|
||||
481,1.0,7
|
||||
482,1.0,6
|
||||
483,1.0,6
|
||||
484,1.0,6
|
||||
485,1.0,6
|
||||
486,0.0,3
|
||||
487,1.0,7
|
||||
488,1.0,6
|
||||
489,1.0,6
|
||||
490,1.0,6
|
||||
491,0.0,3
|
||||
492,1.0,6
|
||||
493,1.0,7
|
||||
494,1.0,12
|
||||
495,1.0,6
|
||||
496,0.0,9
|
||||
497,1.0,6
|
||||
498,1.0,6
|
||||
499,0.0,8
|
||||
500,1.0,6
|
||||
501,0.0,3
|
||||
502,0.0,5
|
||||
503,0.0,3
|
||||
504,1.0,6
|
||||
505,1.0,6
|
||||
506,1.0,6
|
||||
507,1.0,6
|
||||
508,1.0,6
|
||||
509,1.0,6
|
||||
510,1.0,6
|
||||
511,1.0,6
|
||||
512,1.0,6
|
||||
513,1.0,6
|
||||
514,0.0,2
|
||||
515,1.0,7
|
||||
516,1.0,6
|
||||
517,1.0,6
|
||||
518,1.0,6
|
||||
519,1.0,6
|
||||
520,1.0,6
|
||||
521,1.0,7
|
||||
522,0.0,4
|
||||
523,1.0,6
|
||||
524,0.0,5
|
||||
525,1.0,6
|
||||
526,1.0,6
|
||||
527,1.0,6
|
||||
528,1.0,6
|
||||
529,0.0,3
|
||||
530,1.0,6
|
||||
531,1.0,6
|
||||
532,1.0,6
|
||||
533,1.0,7
|
||||
534,1.0,8
|
||||
535,1.0,6
|
||||
536,1.0,6
|
||||
537,1.0,6
|
||||
538,1.0,6
|
||||
539,1.0,7
|
||||
540,1.0,7
|
||||
541,1.0,7
|
||||
542,1.0,8
|
||||
543,1.0,6
|
||||
544,1.0,10
|
||||
545,1.0,6
|
||||
546,1.0,6
|
||||
547,1.0,6
|
||||
548,1.0,8
|
||||
549,1.0,6
|
||||
550,1.0,6
|
||||
551,1.0,8
|
||||
552,1.0,6
|
||||
553,1.0,7
|
||||
554,1.0,6
|
||||
555,1.0,7
|
||||
556,1.0,6
|
||||
557,1.0,6
|
||||
558,1.0,7
|
||||
559,1.0,7
|
||||
560,1.0,7
|
||||
561,1.0,6
|
||||
562,1.0,6
|
||||
563,1.0,6
|
||||
564,1.0,6
|
||||
565,1.0,6
|
||||
566,1.0,6
|
||||
567,1.0,6
|
||||
568,1.0,7
|
||||
569,0.0,4
|
||||
570,1.0,8
|
||||
571,1.0,8
|
||||
572,1.0,7
|
||||
573,1.0,6
|
||||
574,1.0,8
|
||||
575,1.0,6
|
||||
576,1.0,6
|
||||
577,1.0,7
|
||||
578,1.0,6
|
||||
579,1.0,6
|
||||
580,1.0,8
|
||||
581,1.0,7
|
||||
582,1.0,6
|
||||
583,1.0,6
|
||||
584,0.0,3
|
||||
585,1.0,11
|
||||
586,1.0,6
|
||||
587,1.0,8
|
||||
588,0.0,2
|
||||
589,1.0,6
|
||||
590,1.0,6
|
||||
591,1.0,6
|
||||
592,1.0,6
|
||||
593,1.0,8
|
||||
594,1.0,6
|
||||
595,1.0,7
|
||||
596,1.0,6
|
||||
597,1.0,7
|
||||
598,1.0,6
|
||||
599,1.0,8
|
||||
600,0.0,2
|
||||
601,1.0,6
|
||||
602,1.0,7
|
||||
603,1.0,6
|
||||
604,1.0,6
|
||||
605,1.0,10
|
||||
606,1.0,7
|
||||
607,1.0,6
|
||||
608,1.0,6
|
||||
609,1.0,6
|
||||
610,1.0,6
|
||||
611,1.0,6
|
||||
612,1.0,7
|
||||
613,0.0,4
|
||||
614,1.0,7
|
||||
615,1.0,6
|
||||
616,1.0,8
|
||||
617,0.0,3
|
||||
618,1.0,6
|
||||
619,1.0,6
|
||||
620,1.0,6
|
||||
621,1.0,6
|
||||
622,0.0,2
|
||||
623,1.0,6
|
||||
624,1.0,6
|
||||
625,1.0,6
|
||||
626,1.0,6
|
||||
627,1.0,6
|
||||
628,1.0,7
|
||||
629,1.0,6
|
||||
630,1.0,6
|
||||
631,1.0,7
|
||||
632,1.0,6
|
||||
633,1.0,6
|
||||
634,1.0,6
|
||||
635,1.0,6
|
||||
636,1.0,6
|
||||
637,1.0,6
|
||||
638,1.0,6
|
||||
639,1.0,8
|
||||
640,1.0,6
|
||||
641,1.0,8
|
||||
642,1.0,7
|
||||
643,1.0,6
|
||||
644,0.0,3
|
||||
645,1.0,6
|
||||
646,1.0,7
|
||||
647,1.0,6
|
||||
648,1.0,6
|
||||
649,1.0,6
|
||||
650,1.0,10
|
||||
651,1.0,6
|
||||
652,1.0,6
|
||||
653,1.0,6
|
||||
654,1.0,6
|
||||
655,1.0,10
|
||||
656,1.0,6
|
||||
657,1.0,8
|
||||
658,1.0,8
|
||||
659,1.0,7
|
||||
660,1.0,6
|
||||
661,0.0,5
|
||||
662,0.0,2
|
||||
663,1.0,8
|
||||
664,1.0,6
|
||||
665,1.0,10
|
||||
666,1.0,6
|
||||
667,1.0,8
|
||||
668,1.0,10
|
||||
669,1.0,6
|
||||
670,1.0,6
|
||||
671,1.0,6
|
||||
672,1.0,10
|
||||
673,1.0,6
|
||||
674,0.0,4
|
||||
675,1.0,6
|
||||
676,1.0,6
|
||||
677,1.0,6
|
||||
678,1.0,15
|
||||
679,1.0,6
|
||||
680,1.0,6
|
||||
681,1.0,6
|
||||
682,1.0,6
|
||||
683,1.0,6
|
||||
684,1.0,6
|
||||
685,1.0,8
|
||||
686,1.0,6
|
||||
687,1.0,7
|
||||
688,1.0,6
|
||||
689,1.0,6
|
||||
690,1.0,8
|
||||
691,1.0,6
|
||||
692,1.0,6
|
||||
693,1.0,8
|
||||
694,1.0,8
|
||||
695,1.0,6
|
||||
696,1.0,6
|
||||
697,1.0,6
|
||||
698,1.0,10
|
||||
699,1.0,6
|
||||
700,1.0,6
|
||||
701,1.0,6
|
||||
702,1.0,6
|
||||
703,1.0,6
|
||||
704,1.0,6
|
||||
705,1.0,6
|
||||
706,1.0,8
|
||||
707,1.0,8
|
||||
708,1.0,6
|
||||
709,1.0,6
|
||||
710,0.0,2
|
||||
711,1.0,6
|
||||
712,1.0,6
|
||||
713,1.0,6
|
||||
714,1.0,8
|
||||
715,1.0,6
|
||||
716,1.0,6
|
||||
717,1.0,6
|
||||
718,1.0,6
|
||||
719,1.0,6
|
||||
720,1.0,6
|
||||
721,1.0,6
|
||||
722,1.0,6
|
||||
723,1.0,6
|
||||
724,1.0,7
|
||||
725,0.0,3
|
||||
726,1.0,7
|
||||
727,1.0,6
|
||||
728,1.0,6
|
||||
729,1.0,6
|
||||
730,0.0,2
|
||||
731,1.0,6
|
||||
732,1.0,8
|
||||
733,1.0,6
|
||||
734,1.0,6
|
||||
735,1.0,6
|
||||
736,1.0,6
|
||||
737,1.0,9
|
||||
738,1.0,6
|
||||
739,1.0,6
|
||||
740,1.0,6
|
||||
741,1.0,6
|
||||
742,1.0,6
|
||||
743,1.0,6
|
||||
744,1.0,9
|
||||
745,1.0,7
|
||||
746,0.0,4
|
||||
747,1.0,6
|
||||
748,1.0,8
|
||||
749,1.0,11
|
||||
750,1.0,6
|
||||
751,1.0,6
|
||||
752,1.0,6
|
||||
753,1.0,6
|
||||
754,1.0,6
|
||||
755,1.0,8
|
||||
756,1.0,6
|
||||
757,1.0,6
|
||||
758,1.0,8
|
||||
759,1.0,7
|
||||
760,1.0,6
|
||||
761,1.0,8
|
||||
762,1.0,6
|
||||
763,0.0,5
|
||||
764,1.0,9
|
||||
765,1.0,8
|
||||
766,1.0,8
|
||||
767,1.0,6
|
||||
768,1.0,8
|
||||
769,1.0,8
|
||||
770,1.0,6
|
||||
771,0.0,5
|
||||
772,0.0,3
|
||||
773,0.0,2
|
||||
774,1.0,8
|
||||
775,1.0,6
|
||||
776,1.0,6
|
||||
777,1.0,6
|
||||
778,1.0,6
|
||||
779,1.0,6
|
||||
780,1.0,6
|
||||
781,1.0,6
|
||||
782,1.0,6
|
||||
783,1.0,6
|
||||
784,1.0,6
|
||||
785,1.0,6
|
||||
786,1.0,6
|
||||
787,1.0,6
|
||||
788,1.0,6
|
||||
789,0.0,2
|
||||
790,1.0,6
|
||||
791,0.0,4
|
||||
792,1.0,6
|
||||
793,1.0,6
|
||||
794,1.0,6
|
||||
795,1.0,6
|
||||
796,1.0,6
|
||||
797,1.0,8
|
||||
798,0.0,5
|
||||
799,1.0,6
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"algo_name": "Q-learning",
|
||||
"env_name": "FrozenLake-v1",
|
||||
"env_name": "FrozenLakeNoSlippery-v1",
|
||||
"train_eps": 800,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.9,
|
||||
@@ -12,8 +12,8 @@
|
||||
"seed": 10,
|
||||
"show_fig": false,
|
||||
"save_fig": true,
|
||||
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/results/",
|
||||
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLake-v1/20220824-112735/models/",
|
||||
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/results/",
|
||||
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/FrozenLakeNoSlippery-v1/20220825-114335/models/",
|
||||
"n_states": 16,
|
||||
"n_actions": 4
|
||||
}
|
||||
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 55 KiB |
@@ -0,0 +1,801 @@
|
||||
episodes,rewards,steps
|
||||
0,0.0,20
|
||||
1,0.0,14
|
||||
2,0.0,13
|
||||
3,0.0,9
|
||||
4,0.0,10
|
||||
5,0.0,6
|
||||
6,0.0,11
|
||||
7,0.0,6
|
||||
8,0.0,3
|
||||
9,0.0,9
|
||||
10,0.0,11
|
||||
11,0.0,22
|
||||
12,0.0,5
|
||||
13,0.0,16
|
||||
14,0.0,4
|
||||
15,0.0,9
|
||||
16,0.0,18
|
||||
17,0.0,2
|
||||
18,0.0,4
|
||||
19,0.0,8
|
||||
20,0.0,7
|
||||
21,0.0,4
|
||||
22,0.0,22
|
||||
23,0.0,15
|
||||
24,0.0,5
|
||||
25,0.0,16
|
||||
26,0.0,7
|
||||
27,0.0,19
|
||||
28,0.0,22
|
||||
29,0.0,16
|
||||
30,0.0,11
|
||||
31,0.0,22
|
||||
32,0.0,28
|
||||
33,0.0,23
|
||||
34,0.0,4
|
||||
35,0.0,11
|
||||
36,0.0,8
|
||||
37,0.0,15
|
||||
38,0.0,5
|
||||
39,0.0,7
|
||||
40,0.0,9
|
||||
41,0.0,4
|
||||
42,0.0,3
|
||||
43,0.0,6
|
||||
44,0.0,41
|
||||
45,0.0,9
|
||||
46,0.0,23
|
||||
47,0.0,3
|
||||
48,1.0,38
|
||||
49,0.0,29
|
||||
50,0.0,17
|
||||
51,0.0,4
|
||||
52,0.0,2
|
||||
53,0.0,25
|
||||
54,0.0,6
|
||||
55,0.0,2
|
||||
56,0.0,30
|
||||
57,0.0,6
|
||||
58,0.0,7
|
||||
59,0.0,11
|
||||
60,0.0,9
|
||||
61,0.0,8
|
||||
62,0.0,23
|
||||
63,0.0,10
|
||||
64,0.0,3
|
||||
65,0.0,5
|
||||
66,0.0,7
|
||||
67,0.0,18
|
||||
68,0.0,8
|
||||
69,0.0,26
|
||||
70,0.0,6
|
||||
71,0.0,14
|
||||
72,0.0,4
|
||||
73,0.0,25
|
||||
74,0.0,21
|
||||
75,0.0,13
|
||||
76,0.0,4
|
||||
77,0.0,29
|
||||
78,0.0,21
|
||||
79,0.0,6
|
||||
80,0.0,6
|
||||
81,0.0,11
|
||||
82,0.0,21
|
||||
83,0.0,9
|
||||
84,0.0,9
|
||||
85,0.0,7
|
||||
86,0.0,48
|
||||
87,0.0,23
|
||||
88,0.0,160
|
||||
89,0.0,7
|
||||
90,0.0,10
|
||||
91,0.0,24
|
||||
92,0.0,4
|
||||
93,0.0,7
|
||||
94,0.0,17
|
||||
95,0.0,87
|
||||
96,0.0,28
|
||||
97,0.0,7
|
||||
98,0.0,5
|
||||
99,0.0,12
|
||||
100,0.0,14
|
||||
101,0.0,6
|
||||
102,0.0,13
|
||||
103,0.0,93
|
||||
104,0.0,4
|
||||
105,0.0,50
|
||||
106,0.0,8
|
||||
107,0.0,12
|
||||
108,0.0,43
|
||||
109,0.0,30
|
||||
110,0.0,15
|
||||
111,0.0,19
|
||||
112,0.0,182
|
||||
113,0.0,40
|
||||
114,0.0,88
|
||||
115,0.0,19
|
||||
116,0.0,30
|
||||
117,0.0,27
|
||||
118,0.0,5
|
||||
119,0.0,87
|
||||
120,0.0,9
|
||||
121,0.0,64
|
||||
122,0.0,27
|
||||
123,0.0,68
|
||||
124,0.0,81
|
||||
125,0.0,86
|
||||
126,0.0,227
|
||||
127,0.0,41
|
||||
128,0.0,70
|
||||
129,0.0,27
|
||||
130,0.0,6
|
||||
131,0.0,18
|
||||
132,0.0,38
|
||||
133,0.0,26
|
||||
134,0.0,36
|
||||
135,0.0,3
|
||||
136,0.0,61
|
||||
137,0.0,105
|
||||
138,0.0,38
|
||||
139,0.0,18
|
||||
140,0.0,33
|
||||
141,0.0,29
|
||||
142,0.0,49
|
||||
143,0.0,88
|
||||
144,0.0,22
|
||||
145,0.0,65
|
||||
146,0.0,36
|
||||
147,0.0,30
|
||||
148,0.0,58
|
||||
149,0.0,43
|
||||
150,0.0,53
|
||||
151,0.0,43
|
||||
152,0.0,13
|
||||
153,0.0,8
|
||||
154,0.0,39
|
||||
155,0.0,29
|
||||
156,0.0,26
|
||||
157,0.0,60
|
||||
158,0.0,153
|
||||
159,0.0,116
|
||||
160,0.0,53
|
||||
161,0.0,54
|
||||
162,0.0,8
|
||||
163,0.0,58
|
||||
164,0.0,3
|
||||
165,0.0,47
|
||||
166,0.0,16
|
||||
167,0.0,21
|
||||
168,0.0,44
|
||||
169,0.0,29
|
||||
170,0.0,104
|
||||
171,0.0,158
|
||||
172,0.0,83
|
||||
173,0.0,26
|
||||
174,0.0,24
|
||||
175,0.0,10
|
||||
176,0.0,12
|
||||
177,0.0,40
|
||||
178,0.0,25
|
||||
179,0.0,18
|
||||
180,0.0,60
|
||||
181,0.0,203
|
||||
182,0.0,23
|
||||
183,0.0,54
|
||||
184,0.0,71
|
||||
185,0.0,19
|
||||
186,0.0,118
|
||||
187,0.0,26
|
||||
188,0.0,41
|
||||
189,0.0,41
|
||||
190,0.0,60
|
||||
191,0.0,31
|
||||
192,0.0,34
|
||||
193,0.0,35
|
||||
194,0.0,59
|
||||
195,0.0,51
|
||||
196,0.0,426
|
||||
197,0.0,79
|
||||
198,0.0,40
|
||||
199,0.0,17
|
||||
200,0.0,79
|
||||
201,0.0,126
|
||||
202,0.0,61
|
||||
203,0.0,25
|
||||
204,0.0,18
|
||||
205,0.0,27
|
||||
206,0.0,13
|
||||
207,0.0,187
|
||||
208,0.0,160
|
||||
209,0.0,32
|
||||
210,0.0,108
|
||||
211,0.0,164
|
||||
212,0.0,17
|
||||
213,0.0,82
|
||||
214,0.0,194
|
||||
215,0.0,7
|
||||
216,0.0,36
|
||||
217,0.0,156
|
||||
218,0.0,17
|
||||
219,0.0,183
|
||||
220,0.0,243
|
||||
221,0.0,87
|
||||
222,0.0,42
|
||||
223,0.0,80
|
||||
224,0.0,54
|
||||
225,0.0,82
|
||||
226,0.0,97
|
||||
227,0.0,65
|
||||
228,0.0,83
|
||||
229,0.0,159
|
||||
230,0.0,178
|
||||
231,0.0,104
|
||||
232,0.0,21
|
||||
233,0.0,118
|
||||
234,0.0,80
|
||||
235,0.0,170
|
||||
236,0.0,94
|
||||
237,0.0,235
|
||||
238,0.0,13
|
||||
239,0.0,31
|
||||
240,0.0,134
|
||||
241,0.0,32
|
||||
242,0.0,58
|
||||
243,0.0,38
|
||||
244,0.0,28
|
||||
245,0.0,159
|
||||
246,0.0,182
|
||||
247,0.0,51
|
||||
248,0.0,25
|
||||
249,0.0,73
|
||||
250,0.0,56
|
||||
251,0.0,55
|
||||
252,0.0,38
|
||||
253,0.0,292
|
||||
254,0.0,319
|
||||
255,0.0,100
|
||||
256,0.0,84
|
||||
257,0.0,24
|
||||
258,0.0,17
|
||||
259,0.0,159
|
||||
260,0.0,25
|
||||
261,0.0,73
|
||||
262,0.0,130
|
||||
263,0.0,111
|
||||
264,0.0,65
|
||||
265,1.0,58
|
||||
266,0.0,47
|
||||
267,0.0,48
|
||||
268,0.0,13
|
||||
269,0.0,100
|
||||
270,0.0,38
|
||||
271,0.0,111
|
||||
272,0.0,226
|
||||
273,0.0,38
|
||||
274,0.0,83
|
||||
275,0.0,42
|
||||
276,0.0,199
|
||||
277,0.0,83
|
||||
278,0.0,28
|
||||
279,0.0,46
|
||||
280,0.0,262
|
||||
281,0.0,123
|
||||
282,0.0,91
|
||||
283,0.0,53
|
||||
284,0.0,19
|
||||
285,0.0,26
|
||||
286,0.0,93
|
||||
287,0.0,38
|
||||
288,0.0,22
|
||||
289,0.0,43
|
||||
290,0.0,163
|
||||
291,0.0,25
|
||||
292,0.0,59
|
||||
293,0.0,71
|
||||
294,0.0,20
|
||||
295,0.0,115
|
||||
296,0.0,248
|
||||
297,0.0,66
|
||||
298,0.0,58
|
||||
299,0.0,129
|
||||
300,0.0,122
|
||||
301,0.0,47
|
||||
302,0.0,60
|
||||
303,0.0,79
|
||||
304,1.0,137
|
||||
305,0.0,27
|
||||
306,1.0,93
|
||||
307,0.0,46
|
||||
308,1.0,83
|
||||
309,1.0,8
|
||||
310,1.0,6
|
||||
311,1.0,6
|
||||
312,0.0,4
|
||||
313,1.0,6
|
||||
314,0.0,2
|
||||
315,1.0,6
|
||||
316,1.0,6
|
||||
317,1.0,6
|
||||
318,1.0,6
|
||||
319,1.0,8
|
||||
320,0.0,5
|
||||
321,1.0,6
|
||||
322,1.0,7
|
||||
323,0.0,5
|
||||
324,1.0,6
|
||||
325,1.0,6
|
||||
326,1.0,8
|
||||
327,1.0,6
|
||||
328,1.0,6
|
||||
329,1.0,6
|
||||
330,1.0,7
|
||||
331,1.0,6
|
||||
332,1.0,6
|
||||
333,0.0,3
|
||||
334,1.0,7
|
||||
335,0.0,4
|
||||
336,1.0,6
|
||||
337,1.0,6
|
||||
338,1.0,7
|
||||
339,1.0,6
|
||||
340,1.0,6
|
||||
341,1.0,7
|
||||
342,1.0,7
|
||||
343,1.0,7
|
||||
344,1.0,6
|
||||
345,1.0,6
|
||||
346,1.0,6
|
||||
347,1.0,6
|
||||
348,1.0,6
|
||||
349,1.0,6
|
||||
350,1.0,6
|
||||
351,1.0,7
|
||||
352,0.0,4
|
||||
353,1.0,8
|
||||
354,1.0,8
|
||||
355,1.0,7
|
||||
356,1.0,6
|
||||
357,1.0,8
|
||||
358,1.0,6
|
||||
359,1.0,6
|
||||
360,1.0,7
|
||||
361,1.0,6
|
||||
362,1.0,6
|
||||
363,1.0,8
|
||||
364,1.0,7
|
||||
365,1.0,6
|
||||
366,1.0,6
|
||||
367,0.0,3
|
||||
368,1.0,11
|
||||
369,1.0,6
|
||||
370,1.0,8
|
||||
371,0.0,2
|
||||
372,1.0,6
|
||||
373,1.0,6
|
||||
374,1.0,6
|
||||
375,1.0,6
|
||||
376,1.0,8
|
||||
377,1.0,6
|
||||
378,1.0,7
|
||||
379,1.0,6
|
||||
380,1.0,7
|
||||
381,1.0,6
|
||||
382,1.0,8
|
||||
383,0.0,2
|
||||
384,1.0,6
|
||||
385,1.0,7
|
||||
386,1.0,6
|
||||
387,1.0,6
|
||||
388,1.0,10
|
||||
389,1.0,7
|
||||
390,1.0,6
|
||||
391,1.0,6
|
||||
392,1.0,6
|
||||
393,1.0,6
|
||||
394,1.0,6
|
||||
395,1.0,7
|
||||
396,0.0,4
|
||||
397,1.0,7
|
||||
398,1.0,6
|
||||
399,1.0,8
|
||||
400,0.0,3
|
||||
401,1.0,6
|
||||
402,1.0,6
|
||||
403,1.0,6
|
||||
404,1.0,6
|
||||
405,0.0,2
|
||||
406,1.0,6
|
||||
407,1.0,6
|
||||
408,1.0,6
|
||||
409,1.0,6
|
||||
410,1.0,6
|
||||
411,1.0,7
|
||||
412,1.0,6
|
||||
413,1.0,6
|
||||
414,1.0,7
|
||||
415,1.0,6
|
||||
416,1.0,6
|
||||
417,1.0,6
|
||||
418,1.0,6
|
||||
419,1.0,6
|
||||
420,1.0,6
|
||||
421,1.0,6
|
||||
422,1.0,8
|
||||
423,1.0,6
|
||||
424,1.0,8
|
||||
425,1.0,7
|
||||
426,1.0,6
|
||||
427,0.0,3
|
||||
428,1.0,6
|
||||
429,1.0,7
|
||||
430,1.0,6
|
||||
431,1.0,6
|
||||
432,1.0,6
|
||||
433,1.0,10
|
||||
434,1.0,6
|
||||
435,1.0,6
|
||||
436,1.0,6
|
||||
437,1.0,6
|
||||
438,1.0,10
|
||||
439,1.0,6
|
||||
440,1.0,8
|
||||
441,1.0,8
|
||||
442,1.0,7
|
||||
443,1.0,6
|
||||
444,0.0,5
|
||||
445,0.0,2
|
||||
446,1.0,8
|
||||
447,1.0,6
|
||||
448,1.0,10
|
||||
449,1.0,6
|
||||
450,1.0,8
|
||||
451,1.0,10
|
||||
452,1.0,6
|
||||
453,1.0,6
|
||||
454,1.0,6
|
||||
455,1.0,10
|
||||
456,1.0,6
|
||||
457,0.0,4
|
||||
458,1.0,6
|
||||
459,1.0,6
|
||||
460,1.0,6
|
||||
461,1.0,15
|
||||
462,1.0,6
|
||||
463,1.0,6
|
||||
464,1.0,6
|
||||
465,1.0,6
|
||||
466,1.0,6
|
||||
467,1.0,6
|
||||
468,1.0,8
|
||||
469,1.0,6
|
||||
470,1.0,7
|
||||
471,1.0,6
|
||||
472,1.0,6
|
||||
473,1.0,8
|
||||
474,1.0,6
|
||||
475,1.0,6
|
||||
476,1.0,8
|
||||
477,1.0,8
|
||||
478,1.0,6
|
||||
479,1.0,6
|
||||
480,1.0,6
|
||||
481,1.0,10
|
||||
482,1.0,6
|
||||
483,1.0,6
|
||||
484,1.0,6
|
||||
485,1.0,6
|
||||
486,1.0,6
|
||||
487,1.0,6
|
||||
488,1.0,6
|
||||
489,1.0,8
|
||||
490,1.0,8
|
||||
491,1.0,6
|
||||
492,1.0,6
|
||||
493,0.0,2
|
||||
494,1.0,6
|
||||
495,1.0,6
|
||||
496,1.0,6
|
||||
497,1.0,8
|
||||
498,1.0,6
|
||||
499,1.0,6
|
||||
500,1.0,6
|
||||
501,1.0,6
|
||||
502,1.0,6
|
||||
503,1.0,6
|
||||
504,1.0,6
|
||||
505,1.0,6
|
||||
506,1.0,6
|
||||
507,1.0,7
|
||||
508,0.0,3
|
||||
509,1.0,7
|
||||
510,1.0,6
|
||||
511,1.0,6
|
||||
512,1.0,6
|
||||
513,0.0,2
|
||||
514,1.0,6
|
||||
515,1.0,8
|
||||
516,1.0,6
|
||||
517,1.0,6
|
||||
518,1.0,6
|
||||
519,1.0,6
|
||||
520,1.0,9
|
||||
521,1.0,6
|
||||
522,1.0,6
|
||||
523,1.0,6
|
||||
524,1.0,6
|
||||
525,1.0,6
|
||||
526,1.0,6
|
||||
527,1.0,9
|
||||
528,1.0,7
|
||||
529,0.0,4
|
||||
530,1.0,6
|
||||
531,1.0,8
|
||||
532,1.0,11
|
||||
533,1.0,6
|
||||
534,1.0,6
|
||||
535,1.0,6
|
||||
536,1.0,6
|
||||
537,1.0,6
|
||||
538,1.0,8
|
||||
539,1.0,6
|
||||
540,1.0,6
|
||||
541,1.0,8
|
||||
542,1.0,7
|
||||
543,1.0,6
|
||||
544,1.0,8
|
||||
545,1.0,6
|
||||
546,0.0,5
|
||||
547,1.0,9
|
||||
548,1.0,8
|
||||
549,1.0,8
|
||||
550,1.0,6
|
||||
551,1.0,8
|
||||
552,1.0,8
|
||||
553,1.0,6
|
||||
554,0.0,5
|
||||
555,0.0,3
|
||||
556,0.0,2
|
||||
557,1.0,8
|
||||
558,1.0,6
|
||||
559,1.0,6
|
||||
560,1.0,6
|
||||
561,1.0,6
|
||||
562,1.0,6
|
||||
563,1.0,6
|
||||
564,1.0,6
|
||||
565,1.0,6
|
||||
566,1.0,6
|
||||
567,1.0,6
|
||||
568,1.0,6
|
||||
569,1.0,6
|
||||
570,1.0,6
|
||||
571,1.0,6
|
||||
572,0.0,2
|
||||
573,1.0,6
|
||||
574,0.0,4
|
||||
575,1.0,6
|
||||
576,1.0,6
|
||||
577,1.0,6
|
||||
578,1.0,6
|
||||
579,1.0,6
|
||||
580,1.0,8
|
||||
581,0.0,5
|
||||
582,1.0,6
|
||||
583,1.0,6
|
||||
584,1.0,6
|
||||
585,1.0,6
|
||||
586,1.0,6
|
||||
587,1.0,6
|
||||
588,0.0,3
|
||||
589,1.0,6
|
||||
590,1.0,6
|
||||
591,1.0,6
|
||||
592,0.0,2
|
||||
593,1.0,6
|
||||
594,0.0,4
|
||||
595,1.0,6
|
||||
596,1.0,6
|
||||
597,1.0,6
|
||||
598,1.0,6
|
||||
599,1.0,8
|
||||
600,1.0,6
|
||||
601,1.0,7
|
||||
602,1.0,6
|
||||
603,1.0,7
|
||||
604,1.0,6
|
||||
605,0.0,2
|
||||
606,1.0,6
|
||||
607,1.0,6
|
||||
608,0.0,5
|
||||
609,0.0,3
|
||||
610,0.0,3
|
||||
611,1.0,6
|
||||
612,0.0,5
|
||||
613,1.0,8
|
||||
614,1.0,8
|
||||
615,1.0,6
|
||||
616,1.0,6
|
||||
617,1.0,7
|
||||
618,1.0,6
|
||||
619,1.0,6
|
||||
620,1.0,6
|
||||
621,1.0,6
|
||||
622,1.0,6
|
||||
623,1.0,8
|
||||
624,0.0,2
|
||||
625,1.0,6
|
||||
626,1.0,6
|
||||
627,1.0,6
|
||||
628,1.0,6
|
||||
629,1.0,6
|
||||
630,1.0,6
|
||||
631,1.0,6
|
||||
632,1.0,8
|
||||
633,1.0,6
|
||||
634,1.0,8
|
||||
635,1.0,6
|
||||
636,1.0,6
|
||||
637,1.0,8
|
||||
638,1.0,8
|
||||
639,0.0,5
|
||||
640,0.0,4
|
||||
641,0.0,4
|
||||
642,1.0,6
|
||||
643,1.0,6
|
||||
644,1.0,6
|
||||
645,1.0,6
|
||||
646,1.0,8
|
||||
647,1.0,6
|
||||
648,0.0,4
|
||||
649,1.0,6
|
||||
650,1.0,8
|
||||
651,1.0,6
|
||||
652,1.0,6
|
||||
653,1.0,6
|
||||
654,1.0,6
|
||||
655,1.0,6
|
||||
656,1.0,6
|
||||
657,1.0,6
|
||||
658,1.0,8
|
||||
659,1.0,8
|
||||
660,1.0,6
|
||||
661,1.0,8
|
||||
662,1.0,9
|
||||
663,1.0,6
|
||||
664,1.0,6
|
||||
665,1.0,6
|
||||
666,1.0,6
|
||||
667,1.0,10
|
||||
668,1.0,6
|
||||
669,1.0,6
|
||||
670,1.0,6
|
||||
671,1.0,11
|
||||
672,1.0,10
|
||||
673,1.0,8
|
||||
674,1.0,6
|
||||
675,1.0,6
|
||||
676,1.0,6
|
||||
677,0.0,5
|
||||
678,1.0,6
|
||||
679,0.0,2
|
||||
680,1.0,9
|
||||
681,1.0,6
|
||||
682,1.0,8
|
||||
683,1.0,7
|
||||
684,1.0,6
|
||||
685,1.0,6
|
||||
686,1.0,7
|
||||
687,0.0,3
|
||||
688,1.0,7
|
||||
689,0.0,2
|
||||
690,1.0,6
|
||||
691,1.0,6
|
||||
692,1.0,8
|
||||
693,1.0,8
|
||||
694,1.0,6
|
||||
695,1.0,6
|
||||
696,0.0,2
|
||||
697,1.0,8
|
||||
698,1.0,6
|
||||
699,1.0,8
|
||||
700,1.0,6
|
||||
701,1.0,6
|
||||
702,1.0,9
|
||||
703,1.0,6
|
||||
704,1.0,8
|
||||
705,1.0,11
|
||||
706,1.0,6
|
||||
707,1.0,6
|
||||
708,1.0,6
|
||||
709,1.0,6
|
||||
710,1.0,8
|
||||
711,1.0,6
|
||||
712,1.0,6
|
||||
713,1.0,6
|
||||
714,0.0,5
|
||||
715,1.0,6
|
||||
716,1.0,6
|
||||
717,1.0,6
|
||||
718,1.0,6
|
||||
719,1.0,6
|
||||
720,1.0,7
|
||||
721,1.0,6
|
||||
722,1.0,6
|
||||
723,1.0,6
|
||||
724,1.0,6
|
||||
725,1.0,10
|
||||
726,1.0,6
|
||||
727,1.0,6
|
||||
728,1.0,6
|
||||
729,1.0,6
|
||||
730,1.0,6
|
||||
731,1.0,7
|
||||
732,1.0,6
|
||||
733,1.0,8
|
||||
734,1.0,7
|
||||
735,1.0,6
|
||||
736,1.0,6
|
||||
737,1.0,14
|
||||
738,1.0,6
|
||||
739,1.0,6
|
||||
740,1.0,12
|
||||
741,1.0,6
|
||||
742,1.0,6
|
||||
743,1.0,6
|
||||
744,1.0,6
|
||||
745,1.0,6
|
||||
746,1.0,6
|
||||
747,0.0,3
|
||||
748,1.0,6
|
||||
749,1.0,6
|
||||
750,1.0,6
|
||||
751,1.0,7
|
||||
752,1.0,6
|
||||
753,1.0,6
|
||||
754,1.0,6
|
||||
755,1.0,8
|
||||
756,0.0,2
|
||||
757,1.0,6
|
||||
758,1.0,6
|
||||
759,1.0,6
|
||||
760,1.0,6
|
||||
761,1.0,6
|
||||
762,1.0,6
|
||||
763,1.0,6
|
||||
764,1.0,6
|
||||
765,1.0,6
|
||||
766,0.0,4
|
||||
767,1.0,8
|
||||
768,1.0,6
|
||||
769,0.0,2
|
||||
770,1.0,10
|
||||
771,1.0,8
|
||||
772,1.0,6
|
||||
773,1.0,6
|
||||
774,1.0,6
|
||||
775,0.0,3
|
||||
776,1.0,6
|
||||
777,1.0,6
|
||||
778,0.0,6
|
||||
779,1.0,8
|
||||
780,1.0,6
|
||||
781,1.0,9
|
||||
782,1.0,6
|
||||
783,1.0,6
|
||||
784,1.0,8
|
||||
785,1.0,8
|
||||
786,1.0,6
|
||||
787,0.0,5
|
||||
788,1.0,6
|
||||
789,1.0,6
|
||||
790,1.0,6
|
||||
791,1.0,6
|
||||
792,1.0,6
|
||||
793,1.0,6
|
||||
794,1.0,8
|
||||
795,1.0,6
|
||||
796,0.0,2
|
||||
797,1.0,8
|
||||
798,1.0,7
|
||||
799,1.0,6
|
||||
|
@@ -0,0 +1 @@
|
||||
{"algo_name": "Q-learning", "env_name": "Racetrack-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/models/", "n_states": 4, "n_actions": 9}
|
||||