Merge branch 'master' of github.com:datawhalechina/easy-rl

Author: qiwang067
Date: 2022-08-18 16:05:28 +08:00
109 changed files with 3483 additions and 1011 deletions

projects/PARL/DQN.ipynb (new file)

@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 定义模型\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import paddle\n",
"import paddle.nn as nn\n",
"import paddle.nn.functional as F\n",
"import parl\n",
"\n",
"class CartpoleModel(parl.Model):\n",
" \"\"\" Linear network to solve Cartpole problem.\n",
" Args:\n",
" n_states (int): Dimension of observation space.\n",
" n_actions (int): Dimension of action space.\n",
" \"\"\"\n",
"\n",
" def __init__(self, n_states, n_actions):\n",
" super(CartpoleModel, self).__init__()\n",
" hid1_size = 128\n",
" hid2_size = 128\n",
" self.fc1 = nn.Linear(n_states, hid1_size)\n",
" self.fc2 = nn.Linear(hid1_size, hid2_size)\n",
" self.fc3 = nn.Linear(hid2_size, n_actions)\n",
"\n",
" def forward(self, obs):\n",
" h1 = F.relu(self.fc1(obs))\n",
" h2 = F.relu(self.fc2(h1))\n",
" Q = self.fc3(h2)\n",
" return Q"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import parl\n",
"import paddle\n",
"import numpy as np\n",
"\n",
"\n",
"class CartpoleAgent(parl.Agent):\n",
" \"\"\"Agent of Cartpole env.\n",
" Args:\n",
" algorithm(parl.Algorithm): algorithm used to solve the problem.\n",
" \"\"\"\n",
"\n",
" def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):\n",
" super(CartpoleAgent, self).__init__(algorithm)\n",
" assert isinstance(n_actions, int)\n",
" self.n_actions = n_actions\n",
"\n",
" self.global_step = 0\n",
" self.update_target_steps = 200\n",
"\n",
" self.e_greed = e_greed\n",
" self.e_greed_decrement = e_greed_decrement\n",
"\n",
" def sample(self, obs):\n",
" \"\"\"Sample an action `for exploration` when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" sample = np.random.random()\n",
" if sample < self.e_greed:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" if np.random.random() < 0.01:\n",
" act = np.random.randint(self.n_actions)\n",
" else:\n",
" act = self.predict(obs)\n",
" self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)\n",
" return act\n",
"\n",
" def predict(self, obs):\n",
" \"\"\"Predict an action when given an observation\n",
" Args:\n",
" obs(np.float32): shape of (n_states,)\n",
" Returns:\n",
" act(int): action\n",
" \"\"\"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" pred_q = self.alg.predict(obs)\n",
" act = pred_q.argmax().numpy()[0]\n",
" return act\n",
"\n",
" def learn(self, obs, act, reward, next_obs, terminal):\n",
" \"\"\"Update model with an episode data\n",
" Args:\n",
" obs(np.float32): shape of (batch_size, n_states)\n",
" act(np.int32): shape of (batch_size)\n",
" reward(np.float32): shape of (batch_size)\n",
" next_obs(np.float32): shape of (batch_size, n_states)\n",
" terminal(np.float32): shape of (batch_size)\n",
" Returns:\n",
" loss(float)\n",
" \"\"\"\n",
" if self.global_step % self.update_target_steps == 0:\n",
" self.alg.sync_target()\n",
" self.global_step += 1\n",
"\n",
" act = np.expand_dims(act, axis=-1)\n",
" reward = np.expand_dims(reward, axis=-1)\n",
" terminal = np.expand_dims(terminal, axis=-1)\n",
"\n",
" obs = paddle.to_tensor(obs, dtype='float32')\n",
" act = paddle.to_tensor(act, dtype='int32')\n",
" reward = paddle.to_tensor(reward, dtype='float32')\n",
" next_obs = paddle.to_tensor(next_obs, dtype='float32')\n",
" terminal = paddle.to_tensor(terminal, dtype='float32')\n",
" loss = self.alg.learn(obs, act, reward, next_obs, terminal)\n",
" return loss.numpy()[0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import gym\n",
"import numpy as np\n",
"import parl\n",
"\n",
"from parl.utils import logger, ReplayMemory\n",
"from parl.algorithms import DQN\n",
"\n",
"LEARN_FREQ = 5 # training frequency\n",
"MEMORY_SIZE = 200000\n",
"MEMORY_WARMUP_SIZE = 200\n",
"BATCH_SIZE = 64\n",
"LEARNING_RATE = 0.0005\n",
"GAMMA = 0.99\n",
"\n",
"# train an episode\n",
"def run_train_episode(agent, env, rpm):\n",
" total_reward = 0\n",
" obs = env.reset()\n",
" step = 0\n",
" while True:\n",
" step += 1\n",
" action = agent.sample(obs)\n",
" next_obs, reward, done, _ = env.step(action)\n",
" rpm.append(obs, action, reward, next_obs, done)\n",
"\n",
" # train model\n",
" if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):\n",
" # s,a,r,s',done\n",
" (batch_obs, batch_action, batch_reward, batch_next_obs,\n",
" batch_done) = rpm.sample_batch(BATCH_SIZE)\n",
" train_loss = agent.learn(batch_obs, batch_action, batch_reward,\n",
" batch_next_obs, batch_done)\n",
"\n",
" total_reward += reward\n",
" obs = next_obs\n",
" if done:\n",
" break\n",
" return total_reward\n",
"\n",
"\n",
"# evaluate 5 episodes\n",
"def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):\n",
" eval_reward = []\n",
" for i in range(eval_episodes):\n",
" obs = env.reset()\n",
" episode_reward = 0\n",
" while True:\n",
" action = agent.predict(obs)\n",
" obs, reward, done, _ = env.step(action)\n",
" episode_reward += reward\n",
" if render:\n",
" env.render()\n",
" if done:\n",
" break\n",
" eval_reward.append(episode_reward)\n",
" return np.mean(eval_reward)\n",
"\n",
"\n",
"def main(args):\n",
" env = gym.make('CartPole-v0')\n",
" n_states = env.observation_space.shape[0]\n",
" n_actions = env.action_space.n\n",
" logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))\n",
"\n",
" # set action_shape = 0 while in discrete control environment\n",
" rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)\n",
"\n",
" # build an agent\n",
" model = CartpoleModel(n_states=n_states, n_actions=n_actions)\n",
" alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)\n",
" agent = CartpoleAgent(\n",
" alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)\n",
"\n",
" # warmup memory\n",
" while len(rpm) < MEMORY_WARMUP_SIZE:\n",
" run_train_episode(agent, env, rpm)\n",
"\n",
" max_episode = args.max_episode\n",
"\n",
" # start training\n",
" episode = 0\n",
" while episode < max_episode:\n",
" # train part\n",
" for i in range(50):\n",
" total_reward = run_train_episode(agent, env, rpm)\n",
" episode += 1\n",
"\n",
" # test part\n",
" eval_reward = run_evaluate_episodes(agent, env, render=False)\n",
" logger.info('episode:{} e_greed:{} Test reward:{}'.format(\n",
" episode, agent.e_greed, eval_reward))\n",
"\n",
" # save the parameters to ./model.ckpt\n",
" save_path = './model.ckpt'\n",
" agent.save(save_path)\n",
"\n",
" # save the model and parameters of policy network for inference\n",
" save_inference_path = './inference_model'\n",
" input_shapes = [[None, env.observation_space.shape[0]]]\n",
" input_dtypes = ['float32']\n",
" agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)\n",
"\n",
"\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:64]\u001b[0m obs_dim 4, act_dim 2\n",
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:92]\u001b[0m episode:50 e_greed:0.0988929999999989 Test reward:18.4\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:100 e_greed:0.09794799999999795 Test reward:9.6\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:150 e_greed:0.0973899999999974 Test reward:37.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:200 e_greed:0.09684299999999685 Test reward:8.8\n",
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:250 e_greed:0.09635499999999636 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:300 e_greed:0.09585299999999586 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:350 e_greed:0.09535799999999536 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:400 e_greed:0.09486399999999487 Test reward:10.0\n",
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:450 e_greed:0.09435299999999436 Test reward:9.2\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:500 e_greed:0.09384899999999385 Test reward:9.4\n",
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:550 e_greed:0.09302299999999303 Test reward:69.0\n",
"\u001b[32m[08-01 21:48:25 MainThread @3996942455.py:92]\u001b[0m episode:600 e_greed:0.08774199999998775 Test reward:141.2\n",
"\u001b[32m[08-01 21:48:30 MainThread @3996942455.py:92]\u001b[0m episode:650 e_greed:0.0791019999999791 Test reward:184.0\n",
"\u001b[32m[08-01 21:48:35 MainThread @3996942455.py:92]\u001b[0m episode:700 e_greed:0.07011299999997012 Test reward:182.0\n",
"\u001b[32m[08-01 21:48:40 MainThread @3996942455.py:92]\u001b[0m episode:750 e_greed:0.06089099999996089 Test reward:197.4\n",
"\u001b[32m[08-01 21:48:45 MainThread @3996942455.py:92]\u001b[0m episode:800 e_greed:0.05139199999995139 Test reward:183.4\n",
"\u001b[32m[08-01 21:48:50 MainThread @3996942455.py:92]\u001b[0m episode:850 e_greed:0.042255999999942256 Test reward:153.0\n",
"\u001b[32m[08-01 21:48:55 MainThread @3996942455.py:92]\u001b[0m episode:900 e_greed:0.033495999999933496 Test reward:192.6\n",
"\u001b[32m[08-01 21:49:00 MainThread @3996942455.py:92]\u001b[0m episode:950 e_greed:0.024318999999924318 Test reward:166.6\n",
"\u001b[32m[08-01 21:49:06 MainThread @3996942455.py:92]\u001b[0m episode:1000 e_greed:0.014873999999916176 Test reward:187.0\n"
]
}
],
"source": [
"import argparse\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\n",
" '--max_episode',\n",
" type=int,\n",
" default=1000,\n",
" help='stop condition: number of max episode')\n",
"args = parser.parse_args(args=[])\n",
"\n",
"main(args)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.12 ('rl_tutorials')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4f613f1ab80ec98dc1b91d6e720de51301598a187317378e53e49b773c1123dd"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
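Since the final cell parses an empty argument list, the episode budget can be changed simply by passing a different namespace to the `main` function defined above; for example (assuming the notebook cells have already been run):

```python
# Re-run training with a larger episode budget; main() only reads max_episode.
import argparse
main(argparse.Namespace(max_episode=2000))
```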

projects/PARL/README.md (new file)

@@ -0,0 +1,11 @@
[PARL](https://github.com/PaddlePaddle/PARL) is a high-performance, flexible reinforcement learning framework developed by Baidu.
## Installation
1. Install ```parl``` by following the instructions on [PARL Github](https://github.com/PaddlePaddle/PARL)
2. Install PaddlePaddle: ```pip install paddlepaddle```
## FAQ
If you see ```jupyter-client 7.3.1 requires pyzmq>=22.3, but you have pyzmq 18.1.1 which is incompatible.```, run:
```pip install -U pyzmq```
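A quick optional check that both packages import correctly is sketched below; it assumes ```paddle.utils.run_check()``` is available in your PaddlePaddle version and is not part of the original setup steps:

```python
# Optional sanity check for the parl + paddlepaddle installation (sketch).
import paddle
import parl

print("parl  :", parl.__version__)
print("paddle:", paddle.__version__)

# PaddlePaddle's built-in self-check compiles and runs a tiny program.
paddle.utils.run_check()
```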


@@ -1,49 +1,34 @@
## 0. Preface
This project is for learning the basic RL algorithms and aims for **detailed comments** (after much deliberation, the comments stay in Chinese!) and a **clear structure**.
This project is for learning the basic RL algorithms, mainly aimed at RL beginners and at non-specialists who need to combine RL with their own work; it aims for **detailed (Chinese) comments** and a **clear structure**.
The code is organized into the following scripts:
Note that this project is hands-on material: it is recommended to first grasp some theoretical basics of the relevant algorithms before using it. For the theory, see the [Easy RL "Mushroom Book"](https://github.com/datawhalechina/easy-rl), which the author co-wrote.
Future development plans include, but are not limited to, multi-agent algorithms, an RL Python package, and a graphical RL programming platform.
## 1. Project overview
The project mainly consists of the following parts:
* [Jupyter Notebook](./notebooks/): algorithms written as notebooks, with fairly detailed hands-on guidance; recommended for beginners;
* [codes](./assets/): algorithms written as Python scripts, in a style closer to real projects; recommended for readers with some coding experience (their structure is described below);
* [parl](./PARL/): some RL examples based on Baidu's PaddlePaddle platform and the ```parl``` module, written to meet business needs;
* [appendix](./assets/): currently contains pseudocode (in Chinese) for the various RL algorithms.
The [codes](./assets/) directory is organized into the following scripts:
* ```[algorithm_name].py```: the script containing the algorithm itself, e.g. ```dqn.py```; each algorithm comes with some basic modules such as a ```Replay Buffer```, an ```MLP``` (multi-layer perceptron), and so on;
* ```task.py```: the script defining the task, basically including ```argparse```-based hyperparameters plus the training and testing functions;
* ```utils.py```: helpers such as saving results and plotting; in real projects or research it is recommended to log results with ```TensorBoard``` and then plot with libraries such as ```matplotlib``` and ```seaborn``` (a minimal ```task.py``` skeleton is sketched below).
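To make that layout concrete, here is a minimal, self-contained sketch in the spirit of ```task.py```; the random-action placeholder stands in for an agent class such as ```DQN```, so treat it as an illustration of the structure rather than the repository's actual script:

```python
# Hypothetical, stripped-down task.py: argparse hyperparameters plus
# train/test loops. A random policy replaces the real agent so that the
# sketch runs on its own with only gym installed.
import argparse
import gym


def get_args():
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--env_name', default='CartPole-v0', type=str)
    parser.add_argument('--train_eps', default=10, type=int)
    parser.add_argument('--test_eps', default=3, type=int)
    return parser.parse_args()


def run_episodes(env, n_eps, policy):
    rewards = []
    for _ in range(n_eps):
        state, ep_reward, done = env.reset(), 0.0, False
        while not done:
            action = policy(state)                     # agent.sample()/predict() in the real scripts
            state, reward, done, _ = env.step(action)  # gym 0.21 step API
            ep_reward += reward
        rewards.append(ep_reward)
    return rewards


if __name__ == "__main__":
    cfg = get_args()
    env = gym.make(cfg.env_name)

    def random_policy(state):                          # placeholder for e.g. the DQN agent
        return env.action_space.sample()

    print("train rewards:", run_episodes(env, cfg.train_eps, random_policy))
    print("test rewards :", run_episodes(env, cfg.test_eps, random_policy))
    env.close()
```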
## Environment
## 2. Environment
Python 3.7, PyTorch 1.6.0-1.9.0, Gym 0.21.0
Or run the following commands in the directory of this ```README.md``` to reproduce the environment:
Run the following commands in the project root to reproduce the environment:
```bash
conda env create -f environment.yaml
pip install -r requirements.txt
```
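To check an existing environment against the versions above, a small optional sketch (assuming PyTorch and Gym are already installed):

```python
# Print the versions relevant to this project (sketch).
import sys
import gym
import torch

print("python:", sys.version.split()[0])  # expected ~3.7
print("torch :", torch.__version__)       # expected 1.6.0-1.9.0
print("gym   :", gym.__version__)         # expected 0.21.0
```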
## Usage
## 3. Usage
Running the py or ipynb files whose names contain ```train``` directly will train the default task;
you can also run the py files whose names contain ```task``` to train different tasks.
## Contents
| Algorithm | References | Environment | Notes |
| :--------------------------------------: | :----------------------------------------------------------: | ----------------------------------------- | :--------------------------------: |
| [On-Policy First-Visit MC](./MonteCarlo) | [medium blog](https://medium.com/analytics-vidhya/monte-carlo-methods-in-reinforcement-learning-part-1-on-policy-methods-1f004d59686a) | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | [towardsdatascience blog](https://towardsdatascience.com/simple-reinforcement-learning-q-learning-fcddc4b6fe56),[q learning paper](https://ieeexplore.ieee.org/document/8836506) | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | [geeksforgeeks blog](https://www.geeksforgeeks.org/sarsa-reinforcement-learning/) | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf),[Nature DQN Paper](https://www.nature.com/articles/nature14236) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | Uses a CNN instead of a fully connected network, compared with DQN |
| [DoubleDQN](./DoubleDQN) | [DoubleDQN Paper](https://arxiv.org/abs/1509.06461) | [CartPole-v0](./envs/gym_info.md) | |
| [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
| [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
| [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2](./envs/mujoco_info.md) | |
## Refs
[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2)
[RL-Adventure](https://github.com/higgsfield/RL-Adventure)
[Google Open Source Project Style Guide (Chinese edition)](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)


@@ -0,0 +1,4 @@
\relax
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{1}{}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{2}{}\protected@file@percent }
\gdef \@abspage@last{2}


@@ -0,0 +1,398 @@
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 15 AUG 2022 15:05
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
%&-line parsing enabled.
**/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes
(/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes.tex
LaTeX2e <2020-10-01> patch level 4
L3 programming layer <2021-02-18> (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexart.cls (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexbackend.cfg
File: ctexbackend.cfg 2021/03/14 v2.5.6 Backend configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3kernel/expl3.sty
Package: expl3 2021-02-18 L3 programming layer (loader)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
File: l3backend-xetex.def 2021-03-18 L3 backend support: XeTeX
(|extractbb --version)
\c__kernel_sys_dvipdfmx_version_int=\count175
\l__color_backend_stack_int=\count176
\g__color_backend_stack_int=\count177
\g__graphics_track_int=\count178
\l__pdf_internal_box=\box47
\g__pdf_backend_object_int=\count179
\g__pdf_backend_annotation_int=\count180
\g__pdf_backend_link_int=\count181
))
Document Class: ctexart 2021/03/14 v2.5.6 Chinese adapter for class article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-2020-10-01.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-generic.tex))) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty
Package: l3keys2e 2021-03-12 LaTeX2e option processing using LaTeX3 keys
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexhook.sty
Package: ctexhook 2021/03/14 v2.5.6 Document and package hooks (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexpatch.sty
Package: ctexpatch 2021/03/14 v2.5.6 Patching commands (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/fix-cm.sty
Package: fix-cm 2015/01/14 v1.1t fixes to LaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/ts1enc.def
File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
LaTeX Font Info: Redeclaring font encoding TS1 on input line 47.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel.sty
Package: everysel 2021/01/20 v2.1 EverySelectfont Package (MS)
(/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel-2011-10-28.sty))
\l__ctex_tmp_int=\count182
\l__ctex_tmp_box=\box48
\l__ctex_tmp_dim=\dimen138
\g__ctex_section_depth_int=\count183
\g__ctex_font_size_int=\count184
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexopts.cfg
File: ctexopts.cfg 2021/03/14 v2.5.6 Option configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/article.cls
Document Class: article 2020/04/10 v1.4m Standard LaTeX document class
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/size11.clo
File: size11.clo 2020/04/10 v1.4m Standard LaTeX file (size option)
)
\c@part=\count185
\c@section=\count186
\c@subsection=\count187
\c@subsubsection=\count188
\c@paragraph=\count189
\c@subparagraph=\count190
\c@figure=\count191
\c@table=\count192
\abovecaptionskip=\skip47
\belowcaptionskip=\skip48
\bibindent=\dimen139
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/engine/ctex-engine-xetex.def
File: ctex-engine-xetex.def 2021/03/14 v2.5.6 XeLaTeX adapter (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.sty
Package: xeCJK 2020/10/19 v3.8.6 Typesetting CJK scripts with XeLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xtemplate/xtemplate.sty
Package: xtemplate 2021-03-12 L3 Experimental prototype document functions
\l__xtemplate_tmp_dim=\dimen140
\l__xtemplate_tmp_int=\count193
\l__xtemplate_tmp_muskip=\muskip16
\l__xtemplate_tmp_skip=\skip49
)
\l__xeCJK_tmp_int=\count194
\l__xeCJK_tmp_box=\box49
\l__xeCJK_tmp_dim=\dimen141
\l__xeCJK_tmp_skip=\skip50
\g__xeCJK_space_factor_int=\count195
\l__xeCJK_begin_int=\count196
\l__xeCJK_end_int=\count197
\c__xeCJK_CJK_class_int=\XeTeXcharclass1
\c__xeCJK_FullLeft_class_int=\XeTeXcharclass2
\c__xeCJK_FullRight_class_int=\XeTeXcharclass3
\c__xeCJK_HalfLeft_class_int=\XeTeXcharclass4
\c__xeCJK_HalfRight_class_int=\XeTeXcharclass5
\c__xeCJK_NormalSpace_class_int=\XeTeXcharclass6
\c__xeCJK_CM_class_int=\XeTeXcharclass7
\c__xeCJK_HangulJamo_class_int=\XeTeXcharclass8
\l__xeCJK_last_skip=\skip51
\g__xeCJK_node_int=\count198
\c__xeCJK_CJK_node_dim=\dimen142
\c__xeCJK_CJK-space_node_dim=\dimen143
\c__xeCJK_default_node_dim=\dimen144
\c__xeCJK_default-space_node_dim=\dimen145
\c__xeCJK_CJK-widow_node_dim=\dimen146
\c__xeCJK_normalspace_node_dim=\dimen147
\l__xeCJK_ccglue_skip=\skip52
\l__xeCJK_ecglue_skip=\skip53
\l__xeCJK_punct_kern_skip=\skip54
\l__xeCJK_last_penalty_int=\count199
\l__xeCJK_last_bound_dim=\dimen148
\l__xeCJK_last_kern_dim=\dimen149
\l__xeCJK_widow_penalty_int=\count266
Package xtemplate Info: Declaring object type 'xeCJK/punctuation' taking 0
(xtemplate) argument(s) on line 2341.
\l__xeCJK_fixed_punct_width_dim=\dimen150
\l__xeCJK_mixed_punct_width_dim=\dimen151
\l__xeCJK_middle_punct_width_dim=\dimen152
\l__xeCJK_fixed_margin_width_dim=\dimen153
\l__xeCJK_mixed_margin_width_dim=\dimen154
\l__xeCJK_middle_margin_width_dim=\dimen155
\l__xeCJK_bound_punct_width_dim=\dimen156
\l__xeCJK_bound_margin_width_dim=\dimen157
\l__xeCJK_margin_minimum_dim=\dimen158
\l__xeCJK_kerning_total_width_dim=\dimen159
\l__xeCJK_same_align_margin_dim=\dimen160
\l__xeCJK_different_align_margin_dim=\dimen161
\l__xeCJK_kerning_margin_width_dim=\dimen162
\l__xeCJK_kerning_margin_minimum_dim=\dimen163
\l__xeCJK_bound_dim=\dimen164
\l__xeCJK_reverse_bound_dim=\dimen165
\l__xeCJK_margin_dim=\dimen166
\l__xeCJK_minimum_bound_dim=\dimen167
\l__xeCJK_kerning_margin_dim=\dimen168
\g__xeCJK_family_int=\count267
\l__xeCJK_fam_int=\count268
\g__xeCJK_fam_allocation_int=\count269
\l__xeCJK_verb_case_int=\count270
\l__xeCJK_verb_exspace_skip=\skip55
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.sty
Package: fontspec 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
Package: fontspec-xetex 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
\l__fontspec_script_int=\count271
\l__fontspec_language_int=\count272
\l__fontspec_strnum_int=\count273
\l__fontspec_tmp_int=\count274
\l__fontspec_tmpa_int=\count275
\l__fontspec_tmpb_int=\count276
\l__fontspec_tmpc_int=\count277
\l__fontspec_em_int=\count278
\l__fontspec_emdef_int=\count279
\l__fontspec_strong_int=\count280
\l__fontspec_strongdef_int=\count281
\l__fontspec_tmpa_dim=\dimen169
\l__fontspec_tmpb_dim=\dimen170
\l__fontspec_tmpc_dim=\dimen171
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/fontenc.sty
Package: fontenc 2020/08/10 v2.0s Standard LaTeX package
) (/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.cfg
File: xeCJK.cfg 2020/10/19 v3.8.6 Configuration file for xeCJK package
))
\ccwd=\dimen172
\l__ctex_ccglue_skip=\skip56
)
\l__ctex_ziju_dim=\dimen173
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber.sty
Package: zhnumber 2020/05/01 v2.8 Typesetting numbers with Chinese glyphs
\l__zhnum_scale_int=\count282
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber-utf8.cfg
File: zhnumber-utf8.cfg 2020/05/01 v2.8 Chinese numerals with UTF8 encoding
))
\l__ctex_heading_skip=\skip57
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/scheme/ctex-scheme-chinese-article.def
File: ctex-scheme-chinese-article.def 2021/03/14 v2.5.6 Chinese scheme for article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex-name-utf8.cfg
File: ctex-name-utf8.cfg 2021/03/14 v2.5.6 Caption with encoding UTF-8 (CTEX)
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-mac.def
File: ctex-fontset-mac.def 2021/03/14 v2.5.6 macOS fonts definition (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-macnew.def
File: ctex-fontset-macnew.def 2021/03/14 v2.5.6 macOS fonts definition for El Capitan or later version (CTEX)
Package fontspec Warning: Font "Songti SC Light" does not contain requested
(fontspec) Script "CJK".
Package fontspec Info: Font family 'SongtiSCLight(0)' created for font 'Songti
(fontspec) SC Light' with options
(fontspec) [Script={CJK},BoldItalicFont={Kaiti SC
(fontspec) Bold},BoldFont={Songti SC Bold},ItalicFont={Kaiti SC}].
(fontspec)
(fontspec) This font family consists of the following NFSS
(fontspec) series/shapes:
(fontspec)
(fontspec) - 'normal' (m/n) with NFSS spec.: <->"Songti SC
(fontspec) Light/OT:language=dflt;"
(fontspec) - 'small caps' (m/sc) with NFSS spec.:
(fontspec) - 'bold' (b/n) with NFSS spec.: <->"Songti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold small caps' (b/sc) with NFSS spec.:
(fontspec) - 'italic' (m/it) with NFSS spec.: <->"Kaiti
(fontspec) SC/OT:language=dflt;"
(fontspec) - 'italic small caps' (m/scit) with NFSS spec.:
(fontspec) - 'bold italic' (b/it) with NFSS spec.: <->"Kaiti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold italic small caps' (b/scit) with NFSS spec.:
))) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex.cfg
File: ctex.cfg 2021/03/14 v2.5.6 Configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithm.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
(/usr/local/texlive/2021/texmf-dist/tex/latex/float/float.sty
Package: float 2001/11/08 v1.3d Float enhancements (AL)
\c@float@type=\count283
\float@exts=\toks15
\float@box=\box50
\@float@everytoks=\toks16
\@floatcapt=\box51
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/ifthen.sty
Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
)
\@float@every@algorithm=\toks17
\c@algorithm=\count284
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithmic.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
(/usr/local/texlive/2021/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
\KV@toks@=\toks18
)
\c@ALC@unique=\count285
\c@ALC@line=\count286
\c@ALC@rem=\count287
\c@ALC@depth=\count288
\ALC@tlm=\skip58
\algorithmicindent=\skip59
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amssymb.sty
Package: amssymb 2013/01/14 v3.01 AMS font symbols
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amsfonts.sty
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
\@emptytoks=\toks19
\symAMSa=\mathgroup4
\symAMSb=\mathgroup5
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2020/09/23 v2.17i AMS math features
\@mathmargin=\skip60
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2000/06/29 v2.01 AMS text
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks20
\ex@=\dimen174
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen175
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2016/03/08 v2.02 operator names
)
\inf@bad=\count289
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count290
\leftroot@=\count291
LaTeX Info: Redefining \overline on input line 399.
\classnum@=\count292
\DOTSCASE@=\count293
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box52
\strutbox@=\box53
\big@size=\dimen176
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
\macc@depth=\count294
\c@MaxMatrixCols=\count295
\dotsspace@=\muskip17
\c@parentequation=\count296
\dspbrk@lvl=\count297
\tag@help=\toks21
\row@=\count298
\column@=\count299
\maxfields@=\count300
\andhelp@=\toks22
\eqnshift@=\dimen177
\alignsep@=\dimen178
\tagshift@=\dimen179
\tagwidth@=\dimen180
\totwidth@=\dimen181
\lineht@=\dimen182
\@envbody=\toks23
\multlinegap=\skip61
\multlinetaggap=\skip62
\mathdisplay@stack=\toks24
LaTeX Info: Redefining \[ on input line 2923.
LaTeX Info: Redefining \] on input line 2924.
) (./pseudocodes.aux)
\openout1 = `pseudocodes.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 9.
LaTeX Font Info: ... okay on input line 9.
ABD: EverySelectfont initializing macros
LaTeX Info: Redefining \selectfont on input line 9.
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
(fontspec) this).
\symlegacymaths=\mathgroup6
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 9.
LaTeX Font Info: Redeclaring math accent \acute on input line 9.
LaTeX Font Info: Redeclaring math accent \grave on input line 9.
LaTeX Font Info: Redeclaring math accent \ddot on input line 9.
LaTeX Font Info: Redeclaring math accent \tilde on input line 9.
LaTeX Font Info: Redeclaring math accent \bar on input line 9.
LaTeX Font Info: Redeclaring math accent \breve on input line 9.
LaTeX Font Info: Redeclaring math accent \check on input line 9.
LaTeX Font Info: Redeclaring math accent \hat on input line 9.
LaTeX Font Info: Redeclaring math accent \dot on input line 9.
LaTeX Font Info: Redeclaring math accent \mathring on input line 9.
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 9.
LaTeX Font Info: Redeclaring math symbol \Delta on input line 9.
LaTeX Font Info: Redeclaring math symbol \Theta on input line 9.
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 9.
LaTeX Font Info: Redeclaring math symbol \Xi on input line 9.
LaTeX Font Info: Redeclaring math symbol \Pi on input line 9.
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 9.
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 9.
LaTeX Font Info: Redeclaring math symbol \Phi on input line 9.
LaTeX Font Info: Redeclaring math symbol \Psi on input line 9.
LaTeX Font Info: Redeclaring math symbol \Omega on input line 9.
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 9.
LaTeX Font Info: Redeclaring symbol font `operators' on input line 9.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `normal' on input line 9.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 9.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `bold' on input line 9.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 9.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 9.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 9.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 9.
LaTeX Font Info: Trying to load font information for U+msa on input line 20.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info: Trying to load font information for U+msb on input line 20.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
)
Overfull \hbox (38.0069pt too wide) in paragraph at lines 32--33
[] []\TU/SongtiSCLight(0)/m/n/10.95 计 算 实 际 的 $\OML/cmm/m/it/10.95 Q$ \TU/SongtiSCLight(0)/m/n/10.95 值,| 即 $\OML/cmm/m/it/10.95 y[] \OT1/cmr/m/n/10.95 = []$
[]
[1
] [2
] (./pseudocodes.aux) )
Here is how much of TeX's memory you used:
7847 strings out of 476919
208964 string characters out of 5821840
529246 words of memory out of 5000000
27739 multiletter control sequences out of 15000+600000
410995 words of font info for 73 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
101i,11n,104p,414b,663s stack positions out of 5000i,500n,10000p,200000b,80000s
Output written on pseudocodes.pdf (2 pages).


@@ -0,0 +1,63 @@
\documentclass[11pt]{ctexart}
\usepackage{ctex}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amssymb}
\usepackage{amsmath}
\begin{document}
\begin{algorithm}
\floatname{algorithm}{{DQN Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\begin{algorithmic}
% \REQUIRE $n \geq 0 \vee x \neq 0$ % input
% \ENSURE $y = x^n$ % output
\STATE Initialize the policy network parameters $\theta$ % initialization
\STATE Copy the parameters to the target network: $\hat{Q} \leftarrow Q$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1, M$}
\STATE Reset the environment and obtain the initial state $s_t$
\FOR {time step $= 1, T$}
\STATE Sample an action $a_t$ with the $\varepsilon$-greedy policy
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ given $a_t$
\STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
\STATE Update the current state: $s_t \leftarrow s_{t+1}$
\STATE {\bfseries Update the policy:}
\STATE Sample a batch of transitions from $D$
\STATE Compute the target $Q$ value, i.e. $y_{j}= \begin{cases}r_{j} & \text {for terminal } s_{j+1} \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(s_{j+1}, a^{\prime}\right) & \text {for non-terminal } s_{j+1}\end{cases}$
\STATE Take a stochastic gradient descent step on the loss $\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$ with respect to the parameters $\theta$
\STATE Every $C$ steps, copy the parameters: $\hat{Q} \leftarrow Q$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\clearpage
\begin{algorithm}
\floatname{algorithm}{{Soft Q Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}
\STATE Initialize the parameters $\theta$, $\phi$ % initialization
\STATE Copy the parameters: $\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1, M$}
\FOR {time step $= 1, T$}
\STATE Sample an action via $a_{t} \leftarrow f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)$, where $\xi \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ given $a_t$
\STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
\STATE Update the current state: $s_t \leftarrow s_{t+1}$
\STATE (to be completed)
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\end{document}
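For readers who prefer code to pseudocode, the target and loss in the DQN algorithm above correspond roughly to the PyTorch fragment below; `policy_net`, `target_net`, and the batch tensors are assumed to already exist, so this is a sketch rather than the repository's exact implementation:

```python
import torch
import torch.nn as nn

def dqn_loss(policy_net, target_net, batch, gamma=0.99):
    """One DQN update target and loss, following the pseudocode above (sketch)."""
    state, action, reward, next_state, done = batch           # tensors of shape (B, ...)
    # Q(s_j, a_j; theta): Q-value of the action actually taken
    q_values = policy_net(state).gather(1, action.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # max_a' Q_hat(s_{j+1}, a'): bootstrap from the frozen target network
        next_q = target_net(next_state).max(1)[0]
    # y_j = r_j for terminal s_{j+1}, else r_j + gamma * max_a' Q_hat(s_{j+1}, a')
    target = reward + gamma * next_q * (1 - done)
    return nn.MSELoss()(q_values, target)
```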


@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-07-20 23:57:16
LastEditTime: 2022-08-11 09:52:23
@Discription:
@Environment: python 3.7.7
'''
@@ -14,77 +14,39 @@ LastEditTime: 2022-07-20 23:57:16
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import math
import numpy as np
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states,n_actions,cfg):
def __init__(self,n_actions,model,memory,cfg):
self.n_actions = n_actions
self.device = torch.device(cfg.device) # cpu or cuda
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.sample_count = 0 # 用于epsilon的衰减计数
self.epsilon = cfg.epsilon_start
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP(n_states,n_actions).to(self.device)
self.target_net = MLP(n_states,n_actions).to(self.device)
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络target_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
self.memory = memory # 经验回放
def choose_action(self, state):
def sample(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的这里选择指数递减
if random.random() > self.epsilon:
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
q_values = self.policy_net(state)
@@ -92,11 +54,16 @@ class DQN:
else:
action = random.randrange(self.n_actions)
return action
def predict(self,state):
with torch.no_grad():
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
# print('updating')
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
@@ -118,9 +85,11 @@ class DQN:
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
from pathlib import Path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)
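The exponential schedule used in `sample()` above decays epsilon from `epsilon_start` toward `epsilon_end` as the sample count grows; with the typical defaults used in this repository (0.95, 0.01, decay 500), the value evolves roughly as the small sketch below prints:

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500

def epsilon(sample_count):
    # same formula as in DQN.sample() above
    return epsilon_end + (epsilon_start - epsilon_end) * \
        math.exp(-1. * sample_count / epsilon_decay)

for c in (0, 100, 500, 1000, 2000, 5000):
    print(c, round(epsilon(c), 3))
# roughly: 0.95, 0.78, 0.36, 0.14, 0.03, 0.01
```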


@@ -1,134 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
import math
class CNN(nn.Module):
def __init__(self, input_dim, output_dim):
super(CNN, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.features = nn.Sequential(
nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
self.fc = nn.Sequential(
nn.Linear(self.feature_size(), 512),
nn.ReLU(),
nn.Linear(512, self.output_dim)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def feature_size(self):
return self.features(autograd.Variable(torch.zeros(1, *self.input_dim))).view(1, -1).size(1)
def act(self, state, epsilon):
if random.random() > epsilon:
state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
q_value = self.forward(state)
action = q_value.max(1)[1].data[0]
else:
action = random.randrange(env.action_space.n)
return action
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
print(type(state))
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)


@@ -1,142 +0,0 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
import math
import numpy as np
class CNN(nn.Module):
def __init__(self, n_frames, n_actions):
super(CNN,self).__init__()
self.n_frames = n_frames
self.n_actions = n_actions
# Layers
self.conv1 = nn.Conv2d(
in_channels=n_frames,
out_channels=16,
kernel_size=8,
stride=4,
padding=2
)
self.conv2 = nn.Conv2d(
in_channels=16,
out_channels=32,
kernel_size=4,
stride=2,
padding=1
)
self.fc1 = nn.Linear(
in_features=3200,
out_features=256,
)
self.fc2 = nn.Linear(
in_features=256,
out_features=n_actions,
)
# Activation Functions
self.relu = nn.ReLU()
def flatten(self, x):
batch_size = x.size()[0]
x = x.view(batch_size, -1)
return x
def forward(self, x):
# Forward pass
x = self.relu(self.conv1(x)) # In: (80, 80, 4) Out: (20, 20, 16)
x = self.relu(self.conv2(x)) # In: (20, 20, 16) Out: (10, 10, 32)
x = self.flatten(x) # In: (10, 10, 32) Out: (3200,)
x = self.relu(self.fc1(x)) # In: (3200,) Out: (256,)
x = self.fc2(x) # In: (256,) Out: (4,)
return x
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)


@@ -1,19 +0,0 @@
{
"algo_name": "DQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.95,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"deivce": "cpu",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/results/",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/models/",
"save_fig": true
}


@@ -0,0 +1 @@
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/DQN/outputs/CartPole-v0/20220815-185119/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/DQN/outputs/CartPole-v0/20220815-185119/models/", "show_fig": false, "save_fig": true}


@@ -1,23 +1,23 @@
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
import argparse
from common.utils import save_results, make_dir
from common.utils import save_results
from common.utils import plot_rewards,save_args
from common.models import MLP
from common.memories import ReplayBuffer
from dqn import DQN
def get_args():
""" Hyperparameters
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
@@ -36,7 +36,8 @@ def get_args():
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
'/' + curr_time + '/models/' )
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
@@ -47,8 +48,10 @@ def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
agent = DQN(n_states,n_actions, cfg) # 创建智能体
print(f"状态数:{n_states},动作数:{n_actions}")
model = MLP(n_states,n_actions,hidden_dim=cfg.hidden_dim)
memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
agent = DQN(n_actions,model,memory,cfg) # 创建智能体
if seed !=0: # 设置随机种子
torch.manual_seed(seed)
env.seed(seed)
@@ -56,12 +59,11 @@ def env_agent_config(cfg,seed=1):
return env, agent
def train(cfg, env, agent):
''' Training
''' 训练
'''
print('Start training!')
print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
print("开始训练!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
@@ -69,7 +71,7 @@ def train(cfg, env, agent):
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step += 1
action = agent.choose_action(state) # 选择动作
action = agent.sample(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
@@ -82,27 +84,17 @@ def train(cfg, env, agent):
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
print('Finish training!')
if (i_ep + 1) % 10 == 0:
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}Epsilon{agent.epsilon:.3f}')
print("完成训练!")
env.close()
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
res_dic = {'rewards':rewards}
return res_dic
def test(cfg, env, agent):
print('Start testing!')
print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
print("开始测试!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
@@ -110,7 +102,7 @@ def test(cfg, env, agent):
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
action = agent.predict(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
@@ -118,14 +110,10 @@ def test(cfg, env, agent):
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
print('Finish testing')
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
print("完成测试")
env.close()
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return {'rewards':rewards}
if __name__ == "__main__":
@@ -133,16 +121,14 @@ if __name__ == "__main__":
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
save_args(cfg,path = cfg.result_path) # 保存参数到模型路径上
agent.save(path = cfg.model_path) # 保存模型
save_results(res_dic, tag = 'train', path = cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train")
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
env, agent = env_agent_config(cfg) # 也可以不加,加这一行的是为了避免训练之后环境可能会出现问题,因此新建一个环境用于测试
agent.load(path = cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果
path = cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test") # 画出结果
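The moving-average bookkeeping removed from the loops above can still be recovered at plotting time from the raw per-episode rewards; a minimal sketch that reuses the same 0.9/0.1 exponential smoothing as the deleted `ma_rewards` lines:

```python
def smooth_rewards(rewards, alpha=0.9):
    """Exponential moving average, matching the removed ma_rewards logic."""
    smoothed = []
    for r in rewards:
        smoothed.append(alpha * smoothed[-1] + (1 - alpha) * r if smoothed else r)
    return smoothed

print(smooth_rewards([10, 20, 30, 40]))  # roughly [10, 11.0, 12.9, 15.6]
```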


@@ -63,18 +63,18 @@ class MLP(nn.Module):
return self.fc3(x)
class DoubleDQN:
def __init__(self, n_states, n_actions, cfg):
def __init__(self, n_states, n_actions, model, memory, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = torch.device(cfg.device) # 设备cpu或gpu等
self.gamma = cfg.gamma
# e-greedy策略相关参数
self.actions_count = 0
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = model.to(self.device)
self.target_net = model.to(self.device)
# target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data)
@@ -82,13 +82,13 @@ class DoubleDQN:
# 可查parameters()与state_dict()的区别前者require_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = ReplayBuffer(cfg.memory_capacity)
self.memory = memory
def choose_action(self, state):
def sample(self, state):
'''选择动作
'''
self.actions_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.actions_count / self.epsilon_decay)
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
if random.random() > self.epsilon:
with torch.no_grad():
# 先转为张量便于丢给神经网络,state元素数据原本为float64
@@ -104,9 +104,16 @@ class DoubleDQN:
else:
action = random.randrange(self.n_actions)
return action
def predict(self, state):
'''选择动作
'''
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_value = self.policy_net(state)
action = q_value.max(1)[1].item()
return action
def update(self):
if len(self.memory) < self.batch_size:
if len(self.memory) < self.batch_size: # 当memory中的样本数不足一个batch时暂不更新
return
# 从memory中随机采样transition
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
@@ -150,7 +157,7 @@ class DoubleDQN:
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # 更新模型
def save(self,path):
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
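The body of update() is mostly elided by the hunk above; for context, a minimal sketch of how a Double DQN TD target is typically built from the two networks this class keeps (policy_net selects the next action, target_net evaluates it). Variable names are illustrative and assume batched float tensors, not the file's actual code:

import torch

def double_dqn_target(policy_net, target_net, reward_batch, next_state_batch, done_batch, gamma):
    # 用 policy_net 在下一状态上选出 argmax 动作
    next_actions = policy_net(next_state_batch).argmax(dim=1, keepdim=True)
    # 用 target_net 评估所选动作的 Q 值,并切断梯度
    next_q = target_net(next_state_batch).gather(1, next_actions).squeeze(1).detach()
    # TD 目标r + gamma * Q_target(s', argmax_a Q_policy(s', a)),终止状态只保留 r
    return reward_batch + gamma * next_q * (1 - done_batch)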

View File

@@ -1,19 +0,0 @@
{
"algo_name": "DoubleDQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.99,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 2,
"hidden_dim": 256,
"device": "cuda",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/results/",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/models/",
"save_fig": true
}


View File

@@ -0,0 +1 @@
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}


View File

@@ -20,31 +20,33 @@ import argparse
from common.utils import save_results,make_dir
from common.utils import plot_rewards,save_args
from common.models import MLP
from common.memories import ReplayBuffer
from DoubleDQN.double_dqn import DoubleDQN
def get_args():
""" Hyperparameters
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
parser.add_argument('--batch_size',default=64,type=int)
parser.add_argument('--target_update',default=2,type=int)
parser.add_argument('--target_update',default=4,type=int)
parser.add_argument('--hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
'/' + curr_time + '/models/' ) # 保存模型的路径
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
@@ -55,19 +57,20 @@ def env_agent_config(cfg,seed=1):
env.seed(seed)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DoubleDQN(n_states,n_actions,cfg)
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
memory = ReplayBuffer(cfg.memory_capacity)
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print('Start training!')
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
print("开始训练!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state)
action = agent.sample(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
@@ -78,61 +81,45 @@ def train(cfg,env,agent):
if i_ep % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
if (i_ep+1)%10 == 0:
print(f'Env:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Finish training!')
return {'rewards':rewards,'ma_rewards':ma_rewards}
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}Epsilon{agent.epsilon:.3f}')
rewards.append(ep_reward)
print("完成训练!")
return {'rewards':rewards}
def test(cfg,env,agent):
print('Start testing')
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
print("开始测试!")
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.choose_action(state)
action = agent.predict(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
print('Finish testing!')
return {'rewards':rewards,'ma_rewards':ma_rewards}
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
print("完成测试!")
return {'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
print(cfg.device)
# training
env,agent = env_agent_config(cfg,seed=1)
# 训练
env, agent = env_agent_config(cfg,seed=1)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg)
agent.save(path=cfg.model_path)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # 保存参数
agent.save(path=cfg.model_path) # 保存模型
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
# testing
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
res_dic = test(cfg,env,agent)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# 测试
env, agent = env_agent_config(cfg,seed=1)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果
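common.memories.ReplayBuffer is imported above but its code is not part of this diff; a minimal sketch of what such a uniform replay buffer usually looks like (the repo's actual implementation may differ):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # 容量满时自动丢弃最旧的transition
    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  # 均匀随机采样一个batch
        return zip(*batch)  # 转置为 (states, actions, rewards, next_states, dones)
    def __len__(self):
        return len(self.buffer)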

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:14:34
LastEditor: John
LastEditTime: 2021-05-05 16:58:39
LastEditTime: 2022-08-15 18:10:13
Discription:
Environment:
'''
@@ -22,11 +22,10 @@ class FisrtVisitMC:
self.epsilon = cfg.epsilon
self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns
self.returns_sum = defaultdict(float) # 保存return之和
self.returns_count = defaultdict(float)
def choose_action(self,state):
''' e-greed policy '''
def sample(self,state):
if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
@@ -35,6 +34,15 @@ class FisrtVisitMC:
else:
action = np.random.randint(0,self.n_actions)
return action
def predict(self,state):
if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.argmax(self.Q_table[state])
else:
action = np.random.randint(0,self.n_actions)
return action
def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition
# We convert each state to a tuple so that we can use it as a dict key
@@ -50,16 +58,18 @@ class FisrtVisitMC:
self.returns_sum[sa_pair] += G
self.returns_count[sa_pair] += 1.0
self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
def save(self,path):
def save(self,path=None):
'''把 Q表格 的数据保存到文件中
'''
from pathlib import Path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(
obj=self.Q_table,
f=path+"Q_table",
pickle_module=dill
)
def load(self, path):
def load(self, path=None):
'''从文件中读取数据到 Q表格
'''
self.Q_table =torch.load(f=path+"Q_table",pickle_module=dill)
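Note that the predict() added above still builds the epsilon-greedy action_probs vector but then returns a plain argmax, so those probabilities are unused; a purely greedy version is enough for testing. A minimal sketch (illustrative only, not the repo's code):

import numpy as np

def predict(self, state):
    # 测试时直接贪心:见过的状态取 Q 值最大的动作,否则随机选一个动作
    if state in self.Q_table:
        return int(np.argmax(self.Q_table[state]))
    return np.random.randint(0, self.n_actions)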


View File

@@ -0,0 +1 @@
{"algo_name": "First-Visit MC", "env_name": "Racetrack", "train_eps": 200, "test_eps": 20, "gamma": 0.9, "epsilon": 0.15, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/models/", "save_fig": true}


View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 14:26:44
LastEditor: John
LastEditTime: 2022-08-15 18:12:13
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import datetime
import argparse
from common.utils import save_results,save_args,plot_rewards
from MonteCarlo.agent import FisrtVisitMC
from envs.racetrack_env import RacetrackEnv
curr_time = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
def get_args():
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='First-Visit MC',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='Racetrack',type=str,help="name of environment")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.9,type=float,help="discounted factor")
parser.add_argument('--epsilon',default=0.15,type=float,help="the probability to select a random action")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' )
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
n_actions = env.action_space.n
agent = FisrtVisitMC(n_actions, cfg)
return env,agent
def train(cfg, env, agent):
print("开始训练!")
print(f"环境:{cfg.env_name},算法:{cfg.algo_name},设备:{cfg.device}")
rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
one_ep_transition = []
while True:
action = agent.sample(state)
next_state, reward, done = env.step(action)
ep_reward += reward
one_ep_transition.append((state, action, reward))
state = next_state
if done:
break
rewards.append(ep_reward)
agent.update(one_ep_transition)
if (i_ep+1) % 10 == 0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
print("完成训练")
return {'rewards':rewards}
def test(cfg, env, agent):
print("开始测试!")
print(f"环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.predict(state)
next_state, reward, done = env.step(action)
ep_reward += reward
state = next_state
if done:
break
rewards.append(ep_reward)
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
return {'rewards':rewards}
if __name__ == "__main__":
cfg = get_args()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg.result_path) # 保存参数到结果路径上
agent.save(path = cfg.model_path) # 保存模型
save_results(res_dic, tag = 'train', path = cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train")
# 测试
env, agent = env_agent_config(cfg) # 也可以不加,加这一行是为了避免训练之后环境可能出现问题,因此新建一个环境用于测试
agent.load(path = cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test") # 画出结果

View File

@@ -1,118 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 14:26:44
LastEditor: John
LastEditTime: 2021-05-05 17:27:50
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
import torch
import datetime
from common.utils import save_results,make_dir
from common.plot import plot_rewards
from MonteCarlo.agent import FisrtVisitMC
from envs.racetrack_env import RacetrackEnv
curr_time = datetime.datetime.now().strftime(
"%Y%m%d-%H%M%S") # obtain current time
class MCConfig:
def __init__(self):
self.algo = "MC" # name of algo
self.env = 'Racetrack'
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # path to save models
# epsilon: The probability to select a random action .
self.epsilon = 0.15
self.gamma = 0.9 # gamma: Gamma discount factor.
self.train_eps = 200
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
n_actions = 9
agent = FisrtVisitMC(n_actions, cfg)
return env,agent
def train(cfg, env, agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = [] # moving average rewards
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
one_ep_transition = []
while True:
action = agent.choose_action(state)
next_state, reward, done = env.step(action)
ep_reward += reward
one_ep_transition.append((state, action, reward))
state = next_state
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
agent.update(one_ep_transition)
if (i_ep+1) % 10 == 0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
print('Complete training')
return rewards, ma_rewards
def eval(cfg, env, agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = [] # moving average rewards
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
while True:
action = agent.choose_action(state)
next_state, reward, done = env.step(action)
ep_reward += reward
state = next_state
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
if (i_ep+1) % 10 == 0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
return rewards, ma_rewards
if __name__ == "__main__":
cfg = MCConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)


View File

@@ -0,0 +1,15 @@
{
"algo_name": "Q-learning",
"env_name": "CliffWalking-v0",
"train_eps": 400,
"test_eps": 20,
"gamma": 0.9,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 300,
"lr": 0.1,
"device": "cpu",
"result_path": "/root/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220802-163256/results/",
"model_path": "/root/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220802-163256/models/",
"save_fig": true
}


View File

@@ -15,18 +15,20 @@ import torch
from collections import defaultdict
class QLearning(object):
def __init__(self,n_states,
def __init__(self,
n_actions,cfg):
self.n_actions = n_actions
self.lr = cfg.lr # 学习率
self.gamma = cfg.gamma
self.epsilon = 0
self.epsilon = cfg.epsilon_start
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值Q值的映射即Q表
def choose_action(self, state):
def sample(self, state):
''' 采样动作,训练时用
'''
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的这里选择指数递减
@@ -37,6 +39,8 @@ class QLearning(object):
action = np.random.choice(self.n_actions) # 随机选择动作
return action
def predict(self,state):
''' 预测或选择动作,测试时用
'''
action = np.argmax(self.Q_table[str(state)])
return action
def update(self, state, action, reward, next_state, done):
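A quick numeric illustration of the exponential epsilon decay used in sample() above, with the defaults this repo commonly uses (epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300); just a sketch to show the shape of the schedule:

import math

def epsilon_at(step, eps_start=0.95, eps_end=0.01, eps_decay=300):
    # 与 sample() 中的公式一致:从 eps_start 指数衰减到 eps_end
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

for step in (0, 100, 300, 1000, 3000):
    print(step, round(epsilon_at(step), 3))
# 大致输出0→0.95100→0.684300→0.3561000→0.0443000→0.01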

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-06-21 19:36:05
LastEditTime: 2022-08-10 11:25:56
Discription:
Environment:
'''
@@ -18,54 +18,45 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from env.gridworld_env import CliffWalkingWapper
import argparse
from envs.gridworld_env import CliffWalkingWapper
from qlearning import QLearning
from common.utils import plot_rewards
from common.utils import plot_rewards,save_args
from common.utils import save_results,make_dir
def get_args():
"""
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Q-learning',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training") # 训练的回合数
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") # 测试的回合数
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor") # 折扣因子
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") # e-greedy策略中初始epsilon
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") # e-greedy策略中的终止epsilon
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon") # e-greedy策略中epsilon的衰减率
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/',type=str )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/',type=str,help="path to save models")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'Q-learning' # 算法名称
self.env_name = 'CliffWalking-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 400 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.90 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
self.lr = 0.1 # 学习率
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录奖励
ma_rewards = [] # 记录滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录每个回合的奖励
state = env.reset() # 重置环境,即开始新的回合
while True:
action = agent.choose_action(state) # 根据算法选择一个动作
action = agent.sample(state) # 根据算法采样一个动作
next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
agent.update(state, action, reward, next_state, done) # Q学习算法更新
state = next_state # 更新状态
@@ -73,19 +64,14 @@ def train(cfg,env,agent):
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}Epsilon{agent.epsilon}")
print('完成训练!')
return rewards,ma_rewards
return {"rewards":rewards}
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 滑动平均的奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录每个episode的reward
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
@@ -97,13 +83,9 @@ def test(cfg,env,agent):
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards,ma_rewards
return {"rewards":rewards}
def env_agent_config(cfg,seed=1):
'''创建环境和智能体
@@ -119,23 +101,27 @@ def env_agent_config(cfg,seed=1):
env.seed(seed) # 设置随机种子
n_states = env.observation_space.n # 状态维度
n_actions = env.action_space.n # 动作维度
agent = QLearning(n_states,n_actions,cfg)
print(f"状态数:{n_states},动作数:{n_actions}")
agent = QLearning(n_actions,cfg)
return env,agent
if __name__ == "__main__":
cfg = Config()
cfg = get_args()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# 测试
env, agent = env_agent_config(cfg, seed=10)
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果


View File

@@ -0,0 +1 @@
{"algo_name": "Sarsa", "env_name": "CliffWalking-v0", "train_eps": 300, "test_eps": 20, "ep_max_steps": 200, "gamma": 0.99, "epsilon_start": 0.9, "epsilon_end": 0.01, "epsilon_decay": 200, "lr": 0.2, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/models/", "save_fig": true}


View File

@@ -0,0 +1,15 @@
{
"algo_name": "Sarsa",
"env_name": "CliffWalking-v0",
"train_eps": 400,
"test_eps": 20,
"gamma": 0.9,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 300,
"lr": 0.1,
"device": "cpu",
"result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/results/",
"model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/models/",
"save_fig": true
}


View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:58:16
LastEditor: John
LastEditTime: 2022-04-29 20:12:57
LastEditTime: 2022-08-04 22:22:16
Discription:
Environment:
'''
@@ -15,7 +15,7 @@ import torch
import math
class Sarsa(object):
def __init__(self,
n_actions,cfg,):
n_actions,cfg):
self.n_actions = n_actions
self.lr = cfg.lr
self.gamma = cfg.gamma
@@ -24,7 +24,7 @@ class Sarsa(object):
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.Q = defaultdict(lambda: np.zeros(n_actions)) # Q table
def choose_action(self, state):
def sample(self, state):
self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay) # The probability to select a random action; epsilon is exponentially decayed
@@ -33,14 +33,14 @@ class Sarsa(object):
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
return action
def predict_action(self,state):
def predict(self,state):
return np.argmax(self.Q[state])
def update(self, state, action, reward, next_state, next_action,done):
Q_predict = self.Q[state][action]
if done:
Q_target = reward # terminal state
Q_target = reward # 终止状态
else:
Q_target = reward + self.gamma * self.Q[next_state][next_action]
Q_target = reward + self.gamma * self.Q[next_state][next_action] # 与Q learning不同Sarsa是拿下一步动作对应的Q值去更新
self.Q[state][action] += self.lr * (Q_target - Q_predict)
def save(self,path):
'''把 Q表格 的数据保存到文件中
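As the comment in update() above points out, Sarsa bootstraps from the action actually taken next (on-policy), while Q-learning bootstraps from the greedy action (off-policy). A minimal tabular sketch of the two TD targets, with illustrative names only:

def sarsa_target(Q, reward, next_state, next_action, gamma, done):
    # Sarsa用实际采样到的下一动作 next_action 对应的 Q 值
    return reward if done else reward + gamma * Q[next_state][next_action]

def q_learning_target(Q, reward, next_state, gamma, done):
    # Q-learning用下一状态上的最大 Q 值,与实际执行的下一动作无关
    return reward if done else reward + gamma * max(Q[next_state])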

View File

@@ -5,115 +5,114 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 17:59:16
LastEditor: John
LastEditTime: 2022-04-29 20:18:13
LastEditTime: 2022-08-04 22:28:51
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path of file
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path) # add current terminal path to sys.path
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import datetime
import torch
import argparse
from envs.racetrack_env import RacetrackEnv
from Sarsa.sarsa import Sarsa
from common.utils import save_results,make_dir,plot_rewards
from common.utils import save_results,make_dir,plot_rewards,save_args
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
def get_args():
""" 超参数
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=300,type=int,help="episodes of training") # 训练的回合数
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") # 测试的回合数
parser.add_argument('--ep_max_steps',default=200,type=int) # 每回合最大步数
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor") # 折扣因子
parser.add_argument('--epsilon_start',default=0.90,type=float,help="initial value of epsilon") # e-greedy策略中初始epsilon
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") # e-greedy策略中的终止epsilon
parser.add_argument('--epsilon_decay',default=200,type=int,help="decay rate of epsilon") # e-greedy策略中epsilon的衰减率
parser.add_argument('--lr',default=0.2,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
return args
class Config:
''' parameters for Sarsa
'''
def __init__(self):
self.algo_name = 'Qlearning'
self.env_name = 'CliffWalking-v0' # 0 up, 1 right, 2 down, 3 left
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check GPU
self.result_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
self.train_eps = 300 # training episodes
self.test_eps = 20 # testing episodes
self.n_steps = 200 # maximum steps per episode
self.epsilon_start = 0.90 # start value of epsilon
self.epsilon_end = 0.01 # end value of epsilon
self.epsilon_decay = 200 # decay rate of epsilon
self.gamma = 0.99 # gamma: Gamma discount factor.
self.lr = 0.2 # learning rate: step size parameter
self.save = True # if save figures
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
n_states = 9 # number of actions
agent = Sarsa(n_states,cfg)
n_actions = 9 # 动作数
agent = Sarsa(n_actions,cfg)
return env,agent
def train(cfg,env,agent):
rewards = []
ma_rewards = []
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
action = agent.choose_action(state)
action = agent.sample(state)
ep_reward = 0
# while True:
for _ in range(cfg.n_steps):
for _ in range(cfg.ep_max_steps):
next_state, reward, done = env.step(action)
ep_reward+=reward
next_action = agent.choose_action(next_state)
next_action = agent.sample(next_state)
agent.update(state, action, reward, next_state, next_action,done)
state = next_state
action = next_action
if done:
break
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
rewards.append(ep_reward)
if (i_ep+1)%2==0:
print(f"Episode:{i_ep+1}, Reward:{ep_reward}, Epsilon:{agent.epsilon}")
return rewards,ma_rewards
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}Epsilon{agent.epsilon}")
print('完成训练!')
return {"rewards":rewards}
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
# Print out which episode we're on, useful for debugging.
# Generate an episode.
# An episode is an array of (state, action, reward) tuples
state = env.reset()
ep_reward = 0
while True:
# for _ in range(cfg.n_steps):
action = agent.predict_action(state)
# while True:
for _ in range(cfg.ep_max_steps):
action = agent.predict(state)
next_state, reward, done = env.step(action)
ep_reward+=reward
state = next_state
if done:
break
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
rewards.append(ep_reward)
if (i_ep+1)%1==0:
print("Episode:{}/{}: Reward:{}".format(i_ep+1, cfg.test_eps,ep_reward))
print('Complete testing')
return rewards,ma_rewards
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return {"rewards":rewards}
if __name__ == "__main__":
cfg = Config()
env,agent = env_agent_config(cfg,seed=1)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(cfg.result_path,cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="train")
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="test")
cfg = get_args()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果

View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-08-04 22:44:00
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import argparse
from envs.gridworld_env import CliffWalkingWapper
from Sarsa.sarsa import Sarsa
from common.utils import plot_rewards,save_args
from common.utils import save_results,make_dir
def get_args():
"""
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training") # 训练的回合数
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") # 测试的回合数
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor") # 折扣因子
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") # e-greedy策略中初始epsilon
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") # e-greedy策略中的终止epsilon
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon") # e-greedy策略中epsilon的衰减率
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args([])
return args
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录每个回合的奖励
state = env.reset() # 重置环境,即开始新的回合
action = agent.sample(state)
while True:
action = agent.sample(state) # 根据算法采样一个动作
next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
next_action = agent.sample(next_state)
agent.update(state, action, reward, next_state, next_action,done) # 算法更新
state = next_state # 更新状态
action = next_action
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}Epsilon{agent.epsilon}")
print('完成训练!')
return {"rewards":rewards}
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录每个episode的reward
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
while True:
action = agent.predict(state) # 根据算法选择一个动作
next_state, reward, done, _ = env.step(action) # 与环境进行一个交互
state = next_state # 更新状态
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return {"rewards":rewards}
def env_agent_config(cfg,seed=1):
'''创建环境和智能体
Args:
cfg ([type]): [description]
seed (int, optional): 随机种子. Defaults to 1.
Returns:
env [type]: 环境
agent : 智能体
'''
env = gym.make(cfg.env_name)
env = CliffWalkingWapper(env)
env.seed(seed) # 设置随机种子
n_states = env.observation_space.n # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"状态数:{n_states},动作数:{n_actions}")
agent = Sarsa(n_actions,cfg)
return env,agent
if __name__ == "__main__":
cfg = get_args()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:02:24
LastEditor: John
LastEditTime: 2022-07-31 23:18:04
LastEditTime: 2022-08-15 18:11:27
Discription:
Environment:
'''
@@ -42,21 +42,36 @@ def plot_rewards_cn(rewards, ma_rewards, cfg, tag='train'):
if cfg.save:
plt.savefig(cfg.result_path+f"{tag}_rewards_curve_cn")
# plt.show()
def smooth(data, weight=0.9):
'''用于平滑曲线类似于Tensorboard中的smooth
Args:
data (List):输入数据
weight (Float): 平滑权重处于0-1之间数值越高说明越平滑一般取0.9
Returns:
smoothed (List): 平滑后的数据
'''
last = data[0]  # First value in the plot (first timestep)
smoothed = list()
for point in data:
smoothed_val = last * weight + (1 - weight) * point  # 计算平滑值
smoothed.append(smoothed_val)
last = smoothed_val
return smoothed
def plot_rewards(rewards, ma_rewards, cfg, tag='train'):
def plot_rewards(rewards,cfg,path=None,tag='train'):
sns.set()
plt.figure() # 创建一个图形实例,方便同时多画几个图
plt.title("learning curve on {} of {} for {}".format(
cfg.device, cfg.algo_name, cfg.env_name))
plt.title(f"{tag}ing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")
plt.xlabel('episodes')
plt.plot(rewards, label='rewards')
plt.plot(ma_rewards, label='ma rewards')
plt.plot(smooth(rewards), label='smoothed')
plt.legend()
if cfg.save_fig:
plt.savefig(cfg.result_path+"{}_rewards_curve".format(tag))
plt.show()
plt.savefig(f"{path}/{tag}ing_curve.png")
if cfg.show_fig:
plt.show()
def plot_losses(losses, algo="DQN", save=True, path='./'):
sns.set()
@@ -69,19 +84,13 @@ def plot_losses(losses, algo="DQN", save=True, path='./'):
plt.savefig(path+"losses_curve")
plt.show()
def save_results(dic, tag='train', path='./results'):
def save_results(dic, tag='train', path = None):
''' 保存奖励
'''
Path(path).mkdir(parents=True, exist_ok=True)
for key,value in dic.items():
np.save(path+'{}_{}.npy'.format(tag,key),value)
print('Results saved')
# def save_results(rewards, ma_rewards, tag='train', path='./results'):
# ''' 保存奖励
# '''
# np.save(path+'{}_rewards.npy'.format(tag), rewards)
# np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
# print('Result saved!')
def make_dir(*paths):
@@ -100,27 +109,10 @@ def del_empty_dir(*paths):
if not os.listdir(os.path.join(path, dir)):
os.removedirs(os.path.join(path, dir))
def save_args(args):
# save parameters
args_dict = vars(args)
with open(args.result_path+'params.json', 'w') as fp:
def save_args(args,path=None):
# 保存参数
args_dict = vars(args)
Path(path).mkdir(parents=True, exist_ok=True)
with open(f"{path}/params.json", 'w') as fp:
json.dump(args_dict, fp)
print("Parameters saved!")
def smooth(data, weight=0.9):
'''_summary_
Args:
data (List):输入数据
weight (Float): 平滑权重处于0-1之间数值越高说明越平滑一般取0.9
Returns:
smoothed (List): 平滑后的数据
'''
last = data[0] # First value in the plot (first timestep)
smoothed = list()
for point in data:
smoothed_val = last * weight + (1 - weight) * point # 计算平滑值
smoothed.append(smoothed_val)
last = smoothed_val
return smoothed
print("参数已保存!")

View File

@@ -1,26 +1,9 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']

View File

@@ -4,6 +4,7 @@
# This file contains code for the racetrack environment that you will be using
# as part of the second part of the CM50270: Reinforcement Learning coursework.
import imp
import time
import random
import numpy as np
@@ -11,7 +12,7 @@ import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete
from matplotlib import colors
class RacetrackEnv(object) :
@@ -61,7 +62,7 @@ class RacetrackEnv(object) :
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
self.initial_states.append((y, x))
self.action_space = Discrete(9)
self.is_reset = False
#print("Racetrack Environment File Loaded Successfully.")

Some files were not shown because too many files have changed in this diff.