Merge branch 'master' of github.com:datawhalechina/easy-rl
318
projects/PARL/DQN.ipynb
Normal file
@@ -0,0 +1,318 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 定义模型\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import paddle\n",
|
||||
"import paddle.nn as nn\n",
|
||||
"import paddle.nn.functional as F\n",
|
||||
"import parl\n",
|
||||
"\n",
|
||||
"class CartpoleModel(parl.Model):\n",
|
||||
" \"\"\" Linear network to solve Cartpole problem.\n",
|
||||
" Args:\n",
|
||||
" n_states (int): Dimension of observation space.\n",
|
||||
" n_actions (int): Dimension of action space.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, n_states, n_actions):\n",
|
||||
" super(CartpoleModel, self).__init__()\n",
|
||||
" hid1_size = 128\n",
|
||||
" hid2_size = 128\n",
|
||||
" self.fc1 = nn.Linear(n_states, hid1_size)\n",
|
||||
" self.fc2 = nn.Linear(hid1_size, hid2_size)\n",
|
||||
" self.fc3 = nn.Linear(hid2_size, n_actions)\n",
|
||||
"\n",
|
||||
" def forward(self, obs):\n",
|
||||
" h1 = F.relu(self.fc1(obs))\n",
|
||||
" h2 = F.relu(self.fc2(h1))\n",
|
||||
" Q = self.fc3(h2)\n",
|
||||
" return Q"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import parl\n",
|
||||
"import paddle\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CartpoleAgent(parl.Agent):\n",
|
||||
" \"\"\"Agent of Cartpole env.\n",
|
||||
" Args:\n",
|
||||
" algorithm(parl.Algorithm): algorithm used to solve the problem.\n",
|
||||
" \"\"\"\n",
|
||||
"\n",
|
||||
" def __init__(self, algorithm, n_actions, e_greed=0.1, e_greed_decrement=0):\n",
|
||||
" super(CartpoleAgent, self).__init__(algorithm)\n",
|
||||
" assert isinstance(n_actions, int)\n",
|
||||
" self.n_actions = n_actions\n",
|
||||
"\n",
|
||||
" self.global_step = 0\n",
|
||||
" self.update_target_steps = 200\n",
|
||||
"\n",
|
||||
" self.e_greed = e_greed\n",
|
||||
" self.e_greed_decrement = e_greed_decrement\n",
|
||||
"\n",
|
||||
" def sample(self, obs):\n",
|
||||
" \"\"\"Sample an action `for exploration` when given an observation\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (n_states,)\n",
|
||||
" Returns:\n",
|
||||
" act(int): action\n",
|
||||
" \"\"\"\n",
|
||||
" sample = np.random.random()\n",
|
||||
" if sample < self.e_greed:\n",
|
||||
" act = np.random.randint(self.n_actions)\n",
|
||||
" else:\n",
|
||||
" if np.random.random() < 0.01:\n",
|
||||
" act = np.random.randint(self.n_actions)\n",
|
||||
" else:\n",
|
||||
" act = self.predict(obs)\n",
|
||||
" self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)\n",
|
||||
" return act\n",
|
||||
"\n",
|
||||
" def predict(self, obs):\n",
|
||||
" \"\"\"Predict an action when given an observation\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (n_states,)\n",
|
||||
" Returns:\n",
|
||||
" act(int): action\n",
|
||||
" \"\"\"\n",
|
||||
" obs = paddle.to_tensor(obs, dtype='float32')\n",
|
||||
" pred_q = self.alg.predict(obs)\n",
|
||||
" act = pred_q.argmax().numpy()[0]\n",
|
||||
" return act\n",
|
||||
"\n",
|
||||
" def learn(self, obs, act, reward, next_obs, terminal):\n",
|
||||
" \"\"\"Update model with a batch of data\n",
|
||||
" Args:\n",
|
||||
" obs(np.float32): shape of (batch_size, n_states)\n",
|
||||
" act(np.int32): shape of (batch_size)\n",
|
||||
" reward(np.float32): shape of (batch_size)\n",
|
||||
" next_obs(np.float32): shape of (batch_size, n_states)\n",
|
||||
" terminal(np.float32): shape of (batch_size)\n",
|
||||
" Returns:\n",
|
||||
" loss(float)\n",
|
||||
" \"\"\"\n",
|
||||
" if self.global_step % self.update_target_steps == 0:\n",
|
||||
" self.alg.sync_target()\n",
|
||||
" self.global_step += 1\n",
|
||||
"\n",
|
||||
" act = np.expand_dims(act, axis=-1)\n",
|
||||
" reward = np.expand_dims(reward, axis=-1)\n",
|
||||
" terminal = np.expand_dims(terminal, axis=-1)\n",
|
||||
"\n",
|
||||
" obs = paddle.to_tensor(obs, dtype='float32')\n",
|
||||
" act = paddle.to_tensor(act, dtype='int32')\n",
|
||||
" reward = paddle.to_tensor(reward, dtype='float32')\n",
|
||||
" next_obs = paddle.to_tensor(next_obs, dtype='float32')\n",
|
||||
" terminal = paddle.to_tensor(terminal, dtype='float32')\n",
|
||||
" loss = self.alg.learn(obs, act, reward, next_obs, terminal)\n",
|
||||
" return loss.numpy()[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import gym\n",
|
||||
"import numpy as np\n",
|
||||
"import parl\n",
|
||||
"\n",
|
||||
"from parl.utils import logger, ReplayMemory\n",
|
||||
"from parl.algorithms import DQN\n",
|
||||
"\n",
|
||||
"LEARN_FREQ = 5 # training frequency\n",
|
||||
"MEMORY_SIZE = 200000\n",
|
||||
"MEMORY_WARMUP_SIZE = 200\n",
|
||||
"BATCH_SIZE = 64\n",
|
||||
"LEARNING_RATE = 0.0005\n",
|
||||
"GAMMA = 0.99\n",
|
||||
"\n",
|
||||
"# train an episode\n",
|
||||
"def run_train_episode(agent, env, rpm):\n",
|
||||
" total_reward = 0\n",
|
||||
" obs = env.reset()\n",
|
||||
" step = 0\n",
|
||||
" while True:\n",
|
||||
" step += 1\n",
|
||||
" action = agent.sample(obs)\n",
|
||||
" next_obs, reward, done, _ = env.step(action)\n",
|
||||
" rpm.append(obs, action, reward, next_obs, done)\n",
|
||||
"\n",
|
||||
" # train model\n",
|
||||
" if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):\n",
|
||||
" # s,a,r,s',done\n",
|
||||
" (batch_obs, batch_action, batch_reward, batch_next_obs,\n",
|
||||
" batch_done) = rpm.sample_batch(BATCH_SIZE)\n",
|
||||
" train_loss = agent.learn(batch_obs, batch_action, batch_reward,\n",
|
||||
" batch_next_obs, batch_done)\n",
|
||||
"\n",
|
||||
" total_reward += reward\n",
|
||||
" obs = next_obs\n",
|
||||
" if done:\n",
|
||||
" break\n",
|
||||
" return total_reward\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# evaluate 5 episodes\n",
|
||||
"def run_evaluate_episodes(agent, env, eval_episodes=5, render=False):\n",
|
||||
" eval_reward = []\n",
|
||||
" for i in range(eval_episodes):\n",
|
||||
" obs = env.reset()\n",
|
||||
" episode_reward = 0\n",
|
||||
" while True:\n",
|
||||
" action = agent.predict(obs)\n",
|
||||
" obs, reward, done, _ = env.step(action)\n",
|
||||
" episode_reward += reward\n",
|
||||
" if render:\n",
|
||||
" env.render()\n",
|
||||
" if done:\n",
|
||||
" break\n",
|
||||
" eval_reward.append(episode_reward)\n",
|
||||
" return np.mean(eval_reward)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main(args):\n",
|
||||
" env = gym.make('CartPole-v0')\n",
|
||||
" n_states = env.observation_space.shape[0]\n",
|
||||
" n_actions = env.action_space.n\n",
|
||||
" logger.info('n_states {}, n_actions {}'.format(n_states, n_actions))\n",
|
||||
"\n",
|
||||
" # set action_shape = 0 while in discrete control environment\n",
|
||||
" rpm = ReplayMemory(MEMORY_SIZE, n_states, 0)\n",
|
||||
"\n",
|
||||
" # build an agent\n",
|
||||
" model = CartpoleModel(n_states=n_states, n_actions=n_actions)\n",
|
||||
" alg = DQN(model, gamma=GAMMA, lr=LEARNING_RATE)\n",
|
||||
" agent = CartpoleAgent(\n",
|
||||
" alg, n_actions=n_actions, e_greed=0.1, e_greed_decrement=1e-6)\n",
|
||||
"\n",
|
||||
" # warmup memory\n",
|
||||
" while len(rpm) < MEMORY_WARMUP_SIZE:\n",
|
||||
" run_train_episode(agent, env, rpm)\n",
|
||||
"\n",
|
||||
" max_episode = args.max_episode\n",
|
||||
"\n",
|
||||
" # start training\n",
|
||||
" episode = 0\n",
|
||||
" while episode < max_episode:\n",
|
||||
" # train part\n",
|
||||
" for i in range(50):\n",
|
||||
" total_reward = run_train_episode(agent, env, rpm)\n",
|
||||
" episode += 1\n",
|
||||
"\n",
|
||||
" # test part\n",
|
||||
" eval_reward = run_evaluate_episodes(agent, env, render=False)\n",
|
||||
" logger.info('episode:{} e_greed:{} Test reward:{}'.format(\n",
|
||||
" episode, agent.e_greed, eval_reward))\n",
|
||||
"\n",
|
||||
" # save the parameters to ./model.ckpt\n",
|
||||
" save_path = './model.ckpt'\n",
|
||||
" agent.save(save_path)\n",
|
||||
"\n",
|
||||
" # save the model and parameters of policy network for inference\n",
|
||||
" save_inference_path = './inference_model'\n",
|
||||
" input_shapes = [[None, env.observation_space.shape[0]]]\n",
|
||||
" input_dtypes = ['float32']\n",
|
||||
" agent.save_inference_model(save_inference_path, input_shapes, input_dtypes)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:64]\u001b[0m obs_dim 4, act_dim 2\n",
|
||||
"\u001b[32m[08-01 21:48:19 MainThread @3996942455.py:92]\u001b[0m episode:50 e_greed:0.0988929999999989 Test reward:18.4\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:100 e_greed:0.09794799999999795 Test reward:9.6\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:150 e_greed:0.0973899999999974 Test reward:37.8\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:200 e_greed:0.09684299999999685 Test reward:8.8\n",
|
||||
"\u001b[32m[08-01 21:48:20 MainThread @3996942455.py:92]\u001b[0m episode:250 e_greed:0.09635499999999636 Test reward:9.4\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:300 e_greed:0.09585299999999586 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:350 e_greed:0.09535799999999536 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:400 e_greed:0.09486399999999487 Test reward:10.0\n",
|
||||
"\u001b[32m[08-01 21:48:21 MainThread @3996942455.py:92]\u001b[0m episode:450 e_greed:0.09435299999999436 Test reward:9.2\n",
|
||||
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:500 e_greed:0.09384899999999385 Test reward:9.4\n",
|
||||
"\u001b[32m[08-01 21:48:22 MainThread @3996942455.py:92]\u001b[0m episode:550 e_greed:0.09302299999999303 Test reward:69.0\n",
|
||||
"\u001b[32m[08-01 21:48:25 MainThread @3996942455.py:92]\u001b[0m episode:600 e_greed:0.08774199999998775 Test reward:141.2\n",
|
||||
"\u001b[32m[08-01 21:48:30 MainThread @3996942455.py:92]\u001b[0m episode:650 e_greed:0.0791019999999791 Test reward:184.0\n",
|
||||
"\u001b[32m[08-01 21:48:35 MainThread @3996942455.py:92]\u001b[0m episode:700 e_greed:0.07011299999997012 Test reward:182.0\n",
|
||||
"\u001b[32m[08-01 21:48:40 MainThread @3996942455.py:92]\u001b[0m episode:750 e_greed:0.06089099999996089 Test reward:197.4\n",
|
||||
"\u001b[32m[08-01 21:48:45 MainThread @3996942455.py:92]\u001b[0m episode:800 e_greed:0.05139199999995139 Test reward:183.4\n",
|
||||
"\u001b[32m[08-01 21:48:50 MainThread @3996942455.py:92]\u001b[0m episode:850 e_greed:0.042255999999942256 Test reward:153.0\n",
|
||||
"\u001b[32m[08-01 21:48:55 MainThread @3996942455.py:92]\u001b[0m episode:900 e_greed:0.033495999999933496 Test reward:192.6\n",
|
||||
"\u001b[32m[08-01 21:49:00 MainThread @3996942455.py:92]\u001b[0m episode:950 e_greed:0.024318999999924318 Test reward:166.6\n",
|
||||
"\u001b[32m[08-01 21:49:06 MainThread @3996942455.py:92]\u001b[0m episode:1000 e_greed:0.014873999999916176 Test reward:187.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import argparse\n",
|
||||
"parser = argparse.ArgumentParser()\n",
|
||||
"parser.add_argument(\n",
|
||||
" '--max_episode',\n",
|
||||
" type=int,\n",
|
||||
" default=1000,\n",
|
||||
" help='stop condition: number of max episode')\n",
|
||||
"args = parser.parse_args(args=[])\n",
|
||||
"\n",
|
||||
"main(args)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.7.12 ('rl_tutorials')",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.12"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "4f613f1ab80ec98dc1b91d6e720de51301598a187317378e53e49b773c1123dd"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
11
projects/PARL/README.md
Normal file
@@ -0,0 +1,11 @@
|
||||
[PARL](https://github.com/PaddlePaddle/PARL)是一个高性能、灵活的强化学习框架,由百度AI Studio开发。
|
||||
|
||||
## 安装
|
||||
|
||||
1. 安装parl,参考[PARL Github](https://github.com/PaddlePaddle/PARL)
|
||||
2. 安装paddlepaddle:```pip install paddlepaddle```
|
||||
|
||||
## 常见问题
|
||||
|
||||
```jupyter-client 7.3.1 requires pyzmq>=22.3, but you have pyzmq 18.1.1 which is incompatible.```:
|
||||
```pip install -U pyzmq```
|
||||
@@ -1,49 +1,34 @@
|
||||
## 0、写在前面
|
||||
|
||||
本项目用于学习RL基础算法,尽量做到: **注释详细**(经过很长时间的纠结,还是中文注释好了!!!),**结构清晰**。
|
||||
本项目用于学习RL基础算法,主要面向对象为RL初学者、需要结合RL的非专业学习者,尽量做到: **(中文)注释详细**,**结构清晰**。
|
||||
|
||||
代码结构主要分为以下几个脚本:
|
||||
注意本项目为实战内容,建议首先掌握相关算法的一些理论基础,再来享用本项目,理论教程参考本人参与编写的[蘑菇书](https://github.com/datawhalechina/easy-rl)。
|
||||
|
||||
未来开发计划包括但不限于:多智能体算法、强化学习Python包以及强化学习图形化编程平台等等。
|
||||
|
||||
## 1、项目说明
|
||||
|
||||
项目内容主要包含以下几个部分:
|
||||
* [Jupyter Notebook](./notebooks/):使用Notebook写的算法,有比较详细的实战引导,推荐新手食用
|
||||
* [codes](./assets/):这些是基于Python脚本写的算法,风格比较接近实际项目的写法,推荐有一定代码基础的人阅读,下面会说明其具体的一些架构
|
||||
* [parl](./PARL/):应业务需求,写了一些基于百度飞桨平台和```parl```模块的RL实例
|
||||
* [附件](./assets/):目前包含强化学习各算法的中文伪代码
|
||||
|
||||
|
||||
[codes](./assets/)结构主要分为以下几个脚本:
|
||||
* ```[algorithm_name].py```:即保存算法的脚本,例如```dqn.py```,每种算法都会有一定的基础模块,例如```Replay Buffer```、```MLP```(多层感知机)等等;
|
||||
* ```task.py```: 即保存任务的脚本,基本包括基于```argparse```模块的参数,训练以及测试函数等等;
|
||||
* ```utils.py```:该脚本用于保存诸如存储结果以及画图的工具函数,在实际项目或研究中,推荐大家使用```Tensorboard```来保存结果,然后使用诸如```matplotlib```以及```seaborn```来进一步画图。
|
||||
|
||||
## 运行环境
|
||||
## 2、运行环境
|
||||
|
||||
python 3.7、pytorch 1.6.0-1.9.0、gym 0.21.0
|
||||
|
||||
或者在```README.md```目录下执行以下命令复现环境:
|
||||
在项目根目录下执行以下命令复现环境:
|
||||
```bash
|
||||
conda env create -f environment.yaml
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
## 使用说明
|
||||
## 3、使用说明
|
||||
|
||||
直接运行带有```train```的py文件或ipynb文件,即可对默认任务进行训练;
|
||||
也可以运行带有```task```的py文件训练不同的任务
|
||||
|
||||
## 内容导航
|
||||
|
||||
| 算法名称 | 相关论文材料 | 环境 | 备注 |
|
||||
| :--------------------------------------: | :----------------------------------------------------------: | ----------------------------------------- | :--------------------------------: |
|
||||
| [On-Policy First-Visit MC](./MonteCarlo) | [medium blog](https://medium.com/analytics-vidhya/monte-carlo-methods-in-reinforcement-learning-part-1-on-policy-methods-1f004d59686a) | [Racetrack](./envs/racetrack_env.md) | |
|
||||
| [Q-Learning](./QLearning) | [towardsdatascience blog](https://towardsdatascience.com/simple-reinforcement-learning-q-learning-fcddc4b6fe56),[q learning paper](https://ieeexplore.ieee.org/document/8836506) | [CliffWalking-v0](./envs/gym_info.md) | |
|
||||
| [Sarsa](./Sarsa) | [geeksforgeeks blog](https://www.geeksforgeeks.org/sarsa-reinforcement-learning/) | [Racetrack](./envs/racetrack_env.md) | |
|
||||
| [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf),[Nature DQN Paper](https://www.nature.com/articles/nature14236) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | 与DQN相比使用了CNN而不是全连接网络 |
|
||||
| [DoubleDQN](./DoubleDQN) | [DoubleDQN Paper](https://arxiv.org/abs/1509.06461) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
|
||||
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
|
||||
| [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
|
||||
| [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2](./envs/mujoco_info.md) | |
|
||||
|
||||
|
||||
## Refs
|
||||
|
||||
[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2)
|
||||
|
||||
[RL-Adventure](https://github.com/higgsfield/RL-Adventure)
|
||||
|
||||
[Google 开源项目风格指南——中文版](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)
|
||||
4
projects/assets/pseudocodes/pseudocodes.aux
Normal file
@@ -0,0 +1,4 @@
|
||||
\relax
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{1}{}\protected@file@percent }
|
||||
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{2}{}\protected@file@percent }
|
||||
\gdef \@abspage@last{2}
|
||||
398
projects/assets/pseudocodes/pseudocodes.log
Normal file
@@ -0,0 +1,398 @@
|
||||
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 15 AUG 2022 15:05
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
file:line:error style messages enabled.
|
||||
%&-line parsing enabled.
|
||||
**/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes
|
||||
(/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes.tex
|
||||
LaTeX2e <2020-10-01> patch level 4
|
||||
L3 programming layer <2021-02-18> (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexart.cls (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexbackend.cfg
|
||||
File: ctexbackend.cfg 2021/03/14 v2.5.6 Backend configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3kernel/expl3.sty
|
||||
Package: expl3 2021-02-18 L3 programming layer (loader)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
|
||||
File: l3backend-xetex.def 2021-03-18 L3 backend support: XeTeX
|
||||
(|extractbb --version)
|
||||
\c__kernel_sys_dvipdfmx_version_int=\count175
|
||||
\l__color_backend_stack_int=\count176
|
||||
\g__color_backend_stack_int=\count177
|
||||
\g__graphics_track_int=\count178
|
||||
\l__pdf_internal_box=\box47
|
||||
\g__pdf_backend_object_int=\count179
|
||||
\g__pdf_backend_annotation_int=\count180
|
||||
\g__pdf_backend_link_int=\count181
|
||||
))
|
||||
Document Class: ctexart 2021/03/14 v2.5.6 Chinese adapter for class article (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-2020-10-01.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-generic.tex))) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty
|
||||
Package: l3keys2e 2021-03-12 LaTeX2e option processing using LaTeX3 keys
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexhook.sty
|
||||
Package: ctexhook 2021/03/14 v2.5.6 Document and package hooks (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexpatch.sty
|
||||
Package: ctexpatch 2021/03/14 v2.5.6 Patching commands (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/fix-cm.sty
|
||||
Package: fix-cm 2015/01/14 v1.1t fixes to LaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/ts1enc.def
|
||||
File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
|
||||
LaTeX Font Info: Redeclaring font encoding TS1 on input line 47.
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel.sty
|
||||
Package: everysel 2021/01/20 v2.1 EverySelectfont Package (MS)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel-2011-10-28.sty))
|
||||
\l__ctex_tmp_int=\count182
|
||||
\l__ctex_tmp_box=\box48
|
||||
\l__ctex_tmp_dim=\dimen138
|
||||
\g__ctex_section_depth_int=\count183
|
||||
\g__ctex_font_size_int=\count184
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexopts.cfg
|
||||
File: ctexopts.cfg 2021/03/14 v2.5.6 Option configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/article.cls
|
||||
Document Class: article 2020/04/10 v1.4m Standard LaTeX document class
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/size11.clo
|
||||
File: size11.clo 2020/04/10 v1.4m Standard LaTeX file (size option)
|
||||
)
|
||||
\c@part=\count185
|
||||
\c@section=\count186
|
||||
\c@subsection=\count187
|
||||
\c@subsubsection=\count188
|
||||
\c@paragraph=\count189
|
||||
\c@subparagraph=\count190
|
||||
\c@figure=\count191
|
||||
\c@table=\count192
|
||||
\abovecaptionskip=\skip47
|
||||
\belowcaptionskip=\skip48
|
||||
\bibindent=\dimen139
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/engine/ctex-engine-xetex.def
|
||||
File: ctex-engine-xetex.def 2021/03/14 v2.5.6 XeLaTeX adapter (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.sty
|
||||
Package: xeCJK 2020/10/19 v3.8.6 Typesetting CJK scripts with XeLaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xtemplate/xtemplate.sty
|
||||
Package: xtemplate 2021-03-12 L3 Experimental prototype document functions
|
||||
\l__xtemplate_tmp_dim=\dimen140
|
||||
\l__xtemplate_tmp_int=\count193
|
||||
\l__xtemplate_tmp_muskip=\muskip16
|
||||
\l__xtemplate_tmp_skip=\skip49
|
||||
)
|
||||
\l__xeCJK_tmp_int=\count194
|
||||
\l__xeCJK_tmp_box=\box49
|
||||
\l__xeCJK_tmp_dim=\dimen141
|
||||
\l__xeCJK_tmp_skip=\skip50
|
||||
\g__xeCJK_space_factor_int=\count195
|
||||
\l__xeCJK_begin_int=\count196
|
||||
\l__xeCJK_end_int=\count197
|
||||
\c__xeCJK_CJK_class_int=\XeTeXcharclass1
|
||||
\c__xeCJK_FullLeft_class_int=\XeTeXcharclass2
|
||||
\c__xeCJK_FullRight_class_int=\XeTeXcharclass3
|
||||
\c__xeCJK_HalfLeft_class_int=\XeTeXcharclass4
|
||||
\c__xeCJK_HalfRight_class_int=\XeTeXcharclass5
|
||||
\c__xeCJK_NormalSpace_class_int=\XeTeXcharclass6
|
||||
\c__xeCJK_CM_class_int=\XeTeXcharclass7
|
||||
\c__xeCJK_HangulJamo_class_int=\XeTeXcharclass8
|
||||
\l__xeCJK_last_skip=\skip51
|
||||
\g__xeCJK_node_int=\count198
|
||||
\c__xeCJK_CJK_node_dim=\dimen142
|
||||
\c__xeCJK_CJK-space_node_dim=\dimen143
|
||||
\c__xeCJK_default_node_dim=\dimen144
|
||||
\c__xeCJK_default-space_node_dim=\dimen145
|
||||
\c__xeCJK_CJK-widow_node_dim=\dimen146
|
||||
\c__xeCJK_normalspace_node_dim=\dimen147
|
||||
\l__xeCJK_ccglue_skip=\skip52
|
||||
\l__xeCJK_ecglue_skip=\skip53
|
||||
\l__xeCJK_punct_kern_skip=\skip54
|
||||
\l__xeCJK_last_penalty_int=\count199
|
||||
\l__xeCJK_last_bound_dim=\dimen148
|
||||
\l__xeCJK_last_kern_dim=\dimen149
|
||||
\l__xeCJK_widow_penalty_int=\count266
|
||||
|
||||
Package xtemplate Info: Declaring object type 'xeCJK/punctuation' taking 0
|
||||
(xtemplate) argument(s) on line 2341.
|
||||
|
||||
\l__xeCJK_fixed_punct_width_dim=\dimen150
|
||||
\l__xeCJK_mixed_punct_width_dim=\dimen151
|
||||
\l__xeCJK_middle_punct_width_dim=\dimen152
|
||||
\l__xeCJK_fixed_margin_width_dim=\dimen153
|
||||
\l__xeCJK_mixed_margin_width_dim=\dimen154
|
||||
\l__xeCJK_middle_margin_width_dim=\dimen155
|
||||
\l__xeCJK_bound_punct_width_dim=\dimen156
|
||||
\l__xeCJK_bound_margin_width_dim=\dimen157
|
||||
\l__xeCJK_margin_minimum_dim=\dimen158
|
||||
\l__xeCJK_kerning_total_width_dim=\dimen159
|
||||
\l__xeCJK_same_align_margin_dim=\dimen160
|
||||
\l__xeCJK_different_align_margin_dim=\dimen161
|
||||
\l__xeCJK_kerning_margin_width_dim=\dimen162
|
||||
\l__xeCJK_kerning_margin_minimum_dim=\dimen163
|
||||
\l__xeCJK_bound_dim=\dimen164
|
||||
\l__xeCJK_reverse_bound_dim=\dimen165
|
||||
\l__xeCJK_margin_dim=\dimen166
|
||||
\l__xeCJK_minimum_bound_dim=\dimen167
|
||||
\l__xeCJK_kerning_margin_dim=\dimen168
|
||||
\g__xeCJK_family_int=\count267
|
||||
\l__xeCJK_fam_int=\count268
|
||||
\g__xeCJK_fam_allocation_int=\count269
|
||||
\l__xeCJK_verb_case_int=\count270
|
||||
\l__xeCJK_verb_exspace_skip=\skip55
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.sty
|
||||
Package: fontspec 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
|
||||
Package: fontspec-xetex 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
|
||||
\l__fontspec_script_int=\count271
|
||||
\l__fontspec_language_int=\count272
|
||||
\l__fontspec_strnum_int=\count273
|
||||
\l__fontspec_tmp_int=\count274
|
||||
\l__fontspec_tmpa_int=\count275
|
||||
\l__fontspec_tmpb_int=\count276
|
||||
\l__fontspec_tmpc_int=\count277
|
||||
\l__fontspec_em_int=\count278
|
||||
\l__fontspec_emdef_int=\count279
|
||||
\l__fontspec_strong_int=\count280
|
||||
\l__fontspec_strongdef_int=\count281
|
||||
\l__fontspec_tmpa_dim=\dimen169
|
||||
\l__fontspec_tmpb_dim=\dimen170
|
||||
\l__fontspec_tmpc_dim=\dimen171
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/fontenc.sty
|
||||
Package: fontenc 2020/08/10 v2.0s Standard LaTeX package
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.cfg
|
||||
File: xeCJK.cfg 2020/10/19 v3.8.6 Configuration file for xeCJK package
|
||||
))
|
||||
\ccwd=\dimen172
|
||||
\l__ctex_ccglue_skip=\skip56
|
||||
)
|
||||
\l__ctex_ziju_dim=\dimen173
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber.sty
|
||||
Package: zhnumber 2020/05/01 v2.8 Typesetting numbers with Chinese glyphs
|
||||
\l__zhnum_scale_int=\count282
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber-utf8.cfg
|
||||
File: zhnumber-utf8.cfg 2020/05/01 v2.8 Chinese numerals with UTF8 encoding
|
||||
))
|
||||
\l__ctex_heading_skip=\skip57
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/scheme/ctex-scheme-chinese-article.def
|
||||
File: ctex-scheme-chinese-article.def 2021/03/14 v2.5.6 Chinese scheme for article (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex-name-utf8.cfg
|
||||
File: ctex-name-utf8.cfg 2021/03/14 v2.5.6 Caption with encoding UTF-8 (CTEX)
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-mac.def
|
||||
File: ctex-fontset-mac.def 2021/03/14 v2.5.6 macOS fonts definition (CTEX)
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-macnew.def
|
||||
File: ctex-fontset-macnew.def 2021/03/14 v2.5.6 macOS fonts definition for El Capitan or later version (CTEX)
|
||||
|
||||
|
||||
Package fontspec Warning: Font "Songti SC Light" does not contain requested
|
||||
(fontspec) Script "CJK".
|
||||
|
||||
|
||||
Package fontspec Info: Font family 'SongtiSCLight(0)' created for font 'Songti
|
||||
(fontspec) SC Light' with options
|
||||
(fontspec) [Script={CJK},BoldItalicFont={Kaiti SC
|
||||
(fontspec) Bold},BoldFont={Songti SC Bold},ItalicFont={Kaiti SC}].
|
||||
(fontspec)
|
||||
(fontspec) This font family consists of the following NFSS
|
||||
(fontspec) series/shapes:
|
||||
(fontspec)
|
||||
(fontspec) - 'normal' (m/n) with NFSS spec.: <->"Songti SC
|
||||
(fontspec) Light/OT:language=dflt;"
|
||||
(fontspec) - 'small caps' (m/sc) with NFSS spec.:
|
||||
(fontspec) - 'bold' (b/n) with NFSS spec.: <->"Songti SC
|
||||
(fontspec) Bold/OT:language=dflt;"
|
||||
(fontspec) - 'bold small caps' (b/sc) with NFSS spec.:
|
||||
(fontspec) - 'italic' (m/it) with NFSS spec.: <->"Kaiti
|
||||
(fontspec) SC/OT:language=dflt;"
|
||||
(fontspec) - 'italic small caps' (m/scit) with NFSS spec.:
|
||||
(fontspec) - 'bold italic' (b/it) with NFSS spec.: <->"Kaiti SC
|
||||
(fontspec) Bold/OT:language=dflt;"
|
||||
(fontspec) - 'bold italic small caps' (b/scit) with NFSS spec.:
|
||||
|
||||
))) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex.cfg
|
||||
File: ctex.cfg 2021/03/14 v2.5.6 Configuration file (CTEX)
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithm.sty
|
||||
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
|
||||
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/float/float.sty
|
||||
Package: float 2001/11/08 v1.3d Float enhancements (AL)
|
||||
\c@float@type=\count283
|
||||
\float@exts=\toks15
|
||||
\float@box=\box50
|
||||
\@float@everytoks=\toks16
|
||||
\@floatcapt=\box51
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/ifthen.sty
|
||||
Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
|
||||
)
|
||||
\@float@every@algorithm=\toks17
|
||||
\c@algorithm=\count284
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithmic.sty
|
||||
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
|
||||
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/graphics/keyval.sty
|
||||
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
|
||||
\KV@toks@=\toks18
|
||||
)
|
||||
\c@ALC@unique=\count285
|
||||
\c@ALC@line=\count286
|
||||
\c@ALC@rem=\count287
|
||||
\c@ALC@depth=\count288
|
||||
\ALC@tlm=\skip58
|
||||
\algorithmicindent=\skip59
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amssymb.sty
|
||||
Package: amssymb 2013/01/14 v3.01 AMS font symbols
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amsfonts.sty
|
||||
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
|
||||
\@emptytoks=\toks19
|
||||
\symAMSa=\mathgroup4
|
||||
\symAMSb=\mathgroup5
|
||||
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
|
||||
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsmath.sty
|
||||
Package: amsmath 2020/09/23 v2.17i AMS math features
|
||||
\@mathmargin=\skip60
|
||||
|
||||
For additional information on amsmath, use the `?' option.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amstext.sty
|
||||
Package: amstext 2000/06/29 v2.01 AMS text
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsgen.sty
|
||||
File: amsgen.sty 1999/11/30 v2.0 generic functions
|
||||
\@emptytoks=\toks20
|
||||
\ex@=\dimen174
|
||||
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsbsy.sty
|
||||
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
|
||||
\pmbraise@=\dimen175
|
||||
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsopn.sty
|
||||
Package: amsopn 2016/03/08 v2.02 operator names
|
||||
)
|
||||
\inf@bad=\count289
|
||||
LaTeX Info: Redefining \frac on input line 234.
|
||||
\uproot@=\count290
|
||||
\leftroot@=\count291
|
||||
LaTeX Info: Redefining \overline on input line 399.
|
||||
\classnum@=\count292
|
||||
\DOTSCASE@=\count293
|
||||
LaTeX Info: Redefining \ldots on input line 496.
|
||||
LaTeX Info: Redefining \dots on input line 499.
|
||||
LaTeX Info: Redefining \cdots on input line 620.
|
||||
\Mathstrutbox@=\box52
|
||||
\strutbox@=\box53
|
||||
\big@size=\dimen176
|
||||
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
|
||||
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
|
||||
\macc@depth=\count294
|
||||
\c@MaxMatrixCols=\count295
|
||||
\dotsspace@=\muskip17
|
||||
\c@parentequation=\count296
|
||||
\dspbrk@lvl=\count297
|
||||
\tag@help=\toks21
|
||||
\row@=\count298
|
||||
\column@=\count299
|
||||
\maxfields@=\count300
|
||||
\andhelp@=\toks22
|
||||
\eqnshift@=\dimen177
|
||||
\alignsep@=\dimen178
|
||||
\tagshift@=\dimen179
|
||||
\tagwidth@=\dimen180
|
||||
\totwidth@=\dimen181
|
||||
\lineht@=\dimen182
|
||||
\@envbody=\toks23
|
||||
\multlinegap=\skip61
|
||||
\multlinetaggap=\skip62
|
||||
\mathdisplay@stack=\toks24
|
||||
LaTeX Info: Redefining \[ on input line 2923.
|
||||
LaTeX Info: Redefining \] on input line 2924.
|
||||
) (./pseudocodes.aux)
|
||||
\openout1 = `pseudocodes.aux'.
|
||||
|
||||
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 9.
|
||||
LaTeX Font Info: ... okay on input line 9.
|
||||
ABD: EverySelectfont initializing macros
|
||||
LaTeX Info: Redefining \selectfont on input line 9.
|
||||
|
||||
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
|
||||
(fontspec) this).
|
||||
|
||||
\symlegacymaths=\mathgroup6
|
||||
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
|
||||
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \acute on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \grave on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \ddot on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \tilde on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \bar on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \breve on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \check on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \hat on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \dot on input line 9.
|
||||
LaTeX Font Info: Redeclaring math accent \mathring on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Delta on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Theta on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Xi on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Pi on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Phi on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Psi on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \Omega on input line 9.
|
||||
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 9.
|
||||
LaTeX Font Info: Redeclaring symbol font `operators' on input line 9.
|
||||
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
|
||||
(Font) `operators' in the math version `normal' on input line 9.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
|
||||
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 9.
|
||||
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
|
||||
(Font) `operators' in the math version `bold' on input line 9.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
|
||||
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 9.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
|
||||
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
|
||||
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
|
||||
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
|
||||
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
|
||||
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 9.
|
||||
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
|
||||
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
|
||||
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
|
||||
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 9.
|
||||
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
|
||||
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 9.
|
||||
LaTeX Font Info: Trying to load font information for U+msa on input line 20.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
|
||||
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
|
||||
)
|
||||
LaTeX Font Info: Trying to load font information for U+msb on input line 20.
|
||||
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
|
||||
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
|
||||
)
|
||||
Overfull \hbox (38.0069pt too wide) in paragraph at lines 32--33
|
||||
[] []\TU/SongtiSCLight(0)/m/n/10.95 计 算 实 际 的 $\OML/cmm/m/it/10.95 Q$ \TU/SongtiSCLight(0)/m/n/10.95 值,| 即 $\OML/cmm/m/it/10.95 y[] \OT1/cmr/m/n/10.95 = []$
|
||||
[]
|
||||
|
||||
[1
|
||||
|
||||
] [2
|
||||
|
||||
] (./pseudocodes.aux) )
|
||||
Here is how much of TeX's memory you used:
|
||||
7847 strings out of 476919
|
||||
208964 string characters out of 5821840
|
||||
529246 words of memory out of 5000000
|
||||
27739 multiletter control sequences out of 15000+600000
|
||||
410995 words of font info for 73 fonts, out of 8000000 for 9000
|
||||
1348 hyphenation exceptions out of 8191
|
||||
101i,11n,104p,414b,663s stack positions out of 5000i,500n,10000p,200000b,80000s
|
||||
|
||||
Output written on pseudocodes.pdf (2 pages).
|
||||
BIN
projects/assets/pseudocodes/pseudocodes.pdf
Normal file
BIN
projects/assets/pseudocodes/pseudocodes.synctex.gz
Normal file
63
projects/assets/pseudocodes/pseudocodes.tex
Normal file
@@ -0,0 +1,63 @@
|
||||
\documentclass[11pt]{ctexart}
|
||||
\usepackage{ctex}
|
||||
\usepackage{algorithm}
|
||||
\usepackage{algorithmic}
|
||||
\usepackage{amssymb}
|
||||
\usepackage{amsmath}
|
||||
|
||||
|
||||
\begin{document}
|
||||
|
||||
\begin{algorithm}
|
||||
\floatname{algorithm}{{DQN算法}}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\renewcommand{\algorithmicrequire}{\textbf{输入:}}
|
||||
\renewcommand{\algorithmicensure}{\textbf{输出:}}
|
||||
\begin{algorithmic}
|
||||
% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
|
||||
% \ENSURE $y = x^n$ % 输出
|
||||
\STATE 初始化策略网络参数$\theta$ % 初始化
|
||||
\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
|
||||
\STATE 初始化经验回放$D$
|
||||
\FOR {回合数 = $1,M$}
|
||||
\STATE 重置环境,获得初始状态$s_t$
|
||||
\FOR {时步 = $1,T$}
|
||||
\STATE 根据$\varepsilon$-greedy策略采样动作$a_t$
|
||||
\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
|
||||
\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中
|
||||
\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
|
||||
\STATE {\bfseries 更新策略:}
|
||||
\STATE 从$D$中采样一个batch的transition
|
||||
\STATE 计算实际的$Q$值,即$y_{j}= \begin{cases}r_{j} & \text {对于终止状态} s_{j+1} \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(s_{j+1}, a^{\prime}\right) & \text {对于非终止状态} s_{j+1}\end{cases}$
|
||||
\STATE 对损失 $\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降
|
||||
\STATE 每$C$步复制参数$\hat{Q} \leftarrow Q$
|
||||
\ENDFOR
|
||||
\ENDFOR
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
|
||||
\clearpage
|
||||
|
||||
\begin{algorithm}
|
||||
\floatname{algorithm}{{SoftQ算法}}
|
||||
\renewcommand{\thealgorithm}{} % 去掉算法标号
|
||||
\caption{}
|
||||
\begin{algorithmic}
|
||||
\STATE 初始化参数$\theta$和$\phi$% 初始化
|
||||
\STATE 复制参数$\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
|
||||
\STATE 初始化经验回放$D$
|
||||
\FOR {回合数 = $1,M$}
|
||||
\FOR {时步 = $1,T$}
|
||||
\STATE 根据$a_{t} \leftarrow f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)$采样动作,其中$\xi \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
|
||||
\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
|
||||
\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中
|
||||
\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
|
||||
\STATE 待完善
|
||||
\ENDFOR
|
||||
\ENDFOR
|
||||
|
||||
\end{algorithmic}
|
||||
\end{algorithm}
|
||||
|
||||
\end{document}
|
||||
@@ -5,7 +5,7 @@
|
||||
@Email: johnjim0816@gmail.com
|
||||
@Date: 2020-06-12 00:50:49
|
||||
@LastEditor: John
|
||||
LastEditTime: 2022-07-20 23:57:16
|
||||
LastEditTime: 2022-08-11 09:52:23
|
||||
@Discription:
|
||||
@Environment: python 3.7.7
|
||||
'''
|
||||
@@ -14,77 +14,39 @@ LastEditTime: 2022-07-20 23:57:16
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, n_states,n_actions,hidden_dim=128):
|
||||
""" 初始化q网络,为全连接网络
|
||||
n_states: 输入的特征数即环境的状态维度
|
||||
n_actions: 输出的动作维度
|
||||
"""
|
||||
super(MLP, self).__init__()
|
||||
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
|
||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
|
||||
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
|
||||
|
||||
def forward(self, x):
|
||||
# 各层对应的激活函数
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
return self.fc3(x)
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
self.capacity = capacity # 经验回放的容量
|
||||
self.buffer = [] # 缓冲区
|
||||
self.position = 0
|
||||
|
||||
def push(self, state, action, reward, next_state, done):
|
||||
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
|
||||
'''
|
||||
if len(self.buffer) < self.capacity:
|
||||
self.buffer.append(None)
|
||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
||||
self.position = (self.position + 1) % self.capacity
|
||||
|
||||
def sample(self, batch_size):
|
||||
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
|
||||
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
|
||||
return state, action, reward, next_state, done
|
||||
|
||||
def __len__(self):
|
||||
''' 返回当前存储的量
|
||||
'''
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, n_states,n_actions,cfg):
|
||||
def __init__(self,n_actions,model,memory,cfg):
|
||||
|
||||
self.n_actions = n_actions
|
||||
self.device = torch.device(cfg.device) # cpu or cuda
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
self.frame_idx = 0 # 用于epsilon的衰减计数
|
||||
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.sample_count = 0 # 用于epsilon的衰减计数
|
||||
self.epsilon = cfg.epsilon_start
|
||||
self.sample_count = 0
|
||||
self.epsilon_start = cfg.epsilon_start
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = MLP(n_states,n_actions).to(self.device)
|
||||
self.target_net = MLP(n_states,n_actions).to(self.device)
|
||||
self.policy_net = model.to(self.device)
|
||||
self.target_net = model.to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
|
||||
self.memory = memory # 经验回放
|
||||
|
||||
def choose_action(self, state):
|
||||
def sample(self, state):
|
||||
''' 选择动作
|
||||
'''
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
self.sample_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
|
||||
math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的,这里选择指数递减
|
||||
if random.random() > self.epsilon:
|
||||
with torch.no_grad():
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
|
||||
q_values = self.policy_net(state)
|
||||
@@ -92,11 +54,16 @@ class DQN:
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def predict(self,state):
|
||||
with torch.no_grad():
|
||||
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
return
|
||||
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
|
||||
# print('updating')
|
||||
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
@@ -118,9 +85,11 @@ class DQN:
|
||||
self.optimizer.step()
|
||||
|
||||
def save(self, path):
|
||||
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
|
||||
from pathlib import Path
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
|
||||
|
||||
def load(self, path):
|
||||
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
|
||||
self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.autograd as autograd
|
||||
import random
|
||||
import math
|
||||
class CNN(nn.Module):
|
||||
def __init__(self, input_dim, output_dim):
|
||||
super(CNN, self).__init__()
|
||||
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
|
||||
self.features = nn.Sequential(
|
||||
nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(32, 64, kernel_size=4, stride=2),
|
||||
nn.ReLU(),
|
||||
nn.Conv2d(64, 64, kernel_size=3, stride=1),
|
||||
nn.ReLU()
|
||||
)
|
||||
|
||||
self.fc = nn.Sequential(
|
||||
nn.Linear(self.feature_size(), 512),
|
||||
nn.ReLU(),
|
||||
nn.Linear(512, self.output_dim)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.features(x)
|
||||
x = x.view(x.size(0), -1)
|
||||
x = self.fc(x)
|
||||
return x
|
||||
|
||||
def feature_size(self):
|
||||
return self.features(autograd.Variable(torch.zeros(1, *self.input_dim))).view(1, -1).size(1)
|
||||
|
||||
|
||||
def act(self, state, epsilon):
|
||||
if random.random() > epsilon:
|
||||
state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
|
||||
q_value = self.forward(state)
|
||||
action = q_value.max(1)[1].data[0]
|
||||
else:
|
||||
action = random.randrange(env.action_space.n)
|
||||
return action
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
self.capacity = capacity # 经验回放的容量
|
||||
self.buffer = [] # 缓冲区
|
||||
self.position = 0
|
||||
|
||||
def push(self, state, action, reward, next_state, done):
|
||||
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
|
||||
'''
|
||||
if len(self.buffer) < self.capacity:
|
||||
self.buffer.append(None)
|
||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
||||
self.position = (self.position + 1) % self.capacity
|
||||
|
||||
def sample(self, batch_size):
|
||||
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
|
||||
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
|
||||
return state, action, reward, next_state, done
|
||||
|
||||
def __len__(self):
|
||||
''' 返回当前存储的量
|
||||
'''
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, n_states, n_actions, cfg):
|
||||
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = cfg.device # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
self.frame_idx = 0 # 用于epsilon的衰减计数
|
||||
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = CNN(n_states, n_actions).to(self.device)
|
||||
self.target_net = CNN(n_states, n_actions).to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
|
||||
|
||||
def choose_action(self, state):
|
||||
''' 选择动作
|
||||
'''
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
with torch.no_grad():
|
||||
print(type(state))
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
return
|
||||
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
# 转为张量
|
||||
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
|
||||
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
|
||||
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
|
||||
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
|
||||
# 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward
|
||||
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
|
||||
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
|
||||
# 优化更新模型
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
for param in self.policy_net.parameters(): # clip防止梯度爆炸
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step()
|
||||
|
||||
def save(self, path):
|
||||
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
|
||||
|
||||
def load(self, path):
|
||||
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
@@ -1,142 +0,0 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.autograd as autograd
|
||||
import random
|
||||
import math
|
||||
import numpy as np
|
||||
class CNN(nn.Module):
|
||||
def __init__(self, n_frames, n_actions):
|
||||
super(CNN,self).__init__()
|
||||
self.n_frames = n_frames
|
||||
self.n_actions = n_actions
|
||||
|
||||
# Layers
|
||||
self.conv1 = nn.Conv2d(
|
||||
in_channels=n_frames,
|
||||
out_channels=16,
|
||||
kernel_size=8,
|
||||
stride=4,
|
||||
padding=2
|
||||
)
|
||||
self.conv2 = nn.Conv2d(
|
||||
in_channels=16,
|
||||
out_channels=32,
|
||||
kernel_size=4,
|
||||
stride=2,
|
||||
padding=1
|
||||
)
|
||||
self.fc1 = nn.Linear(
|
||||
in_features=3200,
|
||||
out_features=256,
|
||||
)
|
||||
self.fc2 = nn.Linear(
|
||||
in_features=256,
|
||||
out_features=n_actions,
|
||||
)
|
||||
|
||||
# Activation Functions
|
||||
self.relu = nn.ReLU()
|
||||
|
||||
def flatten(self, x):
|
||||
batch_size = x.size()[0]
|
||||
x = x.view(batch_size, -1)
|
||||
return x
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
# Forward pass
|
||||
x = self.relu(self.conv1(x)) # In: (80, 80, 4) Out: (20, 20, 16)
|
||||
x = self.relu(self.conv2(x)) # In: (20, 20, 16) Out: (10, 10, 32)
|
||||
x = self.flatten(x) # In: (10, 10, 32) Out: (3200,)
|
||||
x = self.relu(self.fc1(x)) # In: (3200,) Out: (256,)
|
||||
x = self.fc2(x) # In: (256,) Out: (4,)
|
||||
|
||||
return x
|
||||
|
||||
class ReplayBuffer:
|
||||
def __init__(self, capacity):
|
||||
self.capacity = capacity # 经验回放的容量
|
||||
self.buffer = [] # 缓冲区
|
||||
self.position = 0
|
||||
|
||||
def push(self, state, action, reward, next_state, done):
|
||||
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
|
||||
'''
|
||||
if len(self.buffer) < self.capacity:
|
||||
self.buffer.append(None)
|
||||
self.buffer[self.position] = (state, action, reward, next_state, done)
|
||||
self.position = (self.position + 1) % self.capacity
|
||||
|
||||
def sample(self, batch_size):
|
||||
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
|
||||
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
|
||||
return state, action, reward, next_state, done
|
||||
|
||||
def __len__(self):
|
||||
''' 返回当前存储的量
|
||||
'''
|
||||
return len(self.buffer)
|
||||
|
||||
class DQN:
|
||||
def __init__(self, n_states, n_actions, cfg):
|
||||
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = cfg.device # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma # 奖励的折扣因子
|
||||
# e-greedy策略相关参数
|
||||
self.frame_idx = 0 # 用于epsilon的衰减计数
|
||||
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
|
||||
(cfg.epsilon_start - cfg.epsilon_end) * \
|
||||
math.exp(-1. * frame_idx / cfg.epsilon_decay)
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = CNN(n_states, n_actions).to(self.device)
|
||||
self.target_net = CNN(n_states, n_actions).to(self.device)
|
||||
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
|
||||
target_param.data.copy_(param.data)
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
|
||||
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
|
||||
|
||||
def choose_action(self, state):
|
||||
''' 选择动作
|
||||
'''
|
||||
self.frame_idx += 1
|
||||
if random.random() > self.epsilon(self.frame_idx):
|
||||
with torch.no_grad():
|
||||
state = torch.tensor([state], device=self.device, dtype=torch.float32)
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def update(self):
|
||||
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略
|
||||
return
|
||||
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
self.batch_size)
|
||||
# 转为张量
|
||||
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
|
||||
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
|
||||
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
|
||||
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
|
||||
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
|
||||
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
|
||||
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
|
||||
# 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward
|
||||
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
|
||||
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
|
||||
# 优化更新模型
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
for param in self.policy_net.parameters(): # clip防止梯度爆炸
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step()
|
||||
|
||||
def save(self, path):
|
||||
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
|
||||
|
||||
def load(self, path):
|
||||
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
param.data.copy_(target_param.data)
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"algo_name": "DQN",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 200,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.95,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 500,
|
||||
"lr": 0.0001,
|
||||
"memory_capacity": 100000,
|
||||
"batch_size": 64,
|
||||
"target_update": 4,
|
||||
"hidden_dim": 256,
|
||||
"deivce": "cpu",
|
||||
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/results/",
|
||||
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/models/",
|
||||
"save_fig": true
|
||||
}
|
||||
|
Before Width: | Height: | Size: 28 KiB |
|
Before Width: | Height: | Size: 48 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "DQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/DQN/outputs/CartPole-v0/20220815-185119/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/DQN/outputs/CartPole-v0/20220815-185119/models/", "show_fig": false, "save_fig": true}
|
||||
|
After Width: | Height: | Size: 27 KiB |
|
After Width: | Height: | Size: 44 KiB |
@@ -1,23 +1,23 @@
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
|
||||
parent_path = os.path.dirname(curr_path) # parent path
|
||||
sys.path.append(parent_path) # add to system path
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
import argparse
|
||||
from common.utils import save_results, make_dir
|
||||
from common.utils import save_results
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from dqn import DQN
|
||||
|
||||
def get_args():
|
||||
""" Hyperparameters
|
||||
""" 超参数
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
@@ -36,7 +36,8 @@ def get_args():
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results/' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models/' ) # path to save models
|
||||
'/' + curr_time + '/models/' )
|
||||
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
@@ -47,8 +48,10 @@ def env_agent_config(cfg,seed=1):
|
||||
env = gym.make(cfg.env_name) # 创建环境
|
||||
n_states = env.observation_space.shape[0] # 状态维度
|
||||
n_actions = env.action_space.n # 动作维度
|
||||
print(f"n states: {n_states}, n actions: {n_actions}")
|
||||
agent = DQN(n_states,n_actions, cfg) # 创建智能体
|
||||
print(f"状态数:{n_states},动作数:{n_actions}")
|
||||
model = MLP(n_states,n_actions,hidden_dim=cfg.hidden_dim)
|
||||
memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
|
||||
agent = DQN(n_actions,model,memory,cfg) # 创建智能体
|
||||
if seed !=0: # 设置随机种子
|
||||
torch.manual_seed(seed)
|
||||
env.seed(seed)
|
||||
@@ -56,12 +59,11 @@ def env_agent_config(cfg,seed=1):
|
||||
return env, agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
''' Training
|
||||
''' 训练
|
||||
'''
|
||||
print('Start training!')
|
||||
print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
|
||||
print("开始训练!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
@@ -69,7 +71,7 @@ def train(cfg, env, agent):
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step += 1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
action = agent.sample(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
agent.memory.push(state, action, reward,
|
||||
next_state, done) # 保存transition
|
||||
@@ -82,27 +84,17 @@ def train(cfg, env, agent):
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep + 1) % 1 == 0:
|
||||
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
|
||||
print('Finish training!')
|
||||
if (i_ep + 1) % 10 == 0:
|
||||
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f},Epislon:{agent.epsilon:.3f}')
|
||||
print("完成训练!")
|
||||
env.close()
|
||||
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
res_dic = {'rewards':rewards}
|
||||
return res_dic
|
||||
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print('Start testing!')
|
||||
print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
|
||||
############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
################################################################################
|
||||
print("开始测试!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
@@ -110,7 +102,7 @@ def test(cfg, env, agent):
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step+=1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
action = agent.predict(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
@@ -118,14 +110,10 @@ def test(cfg, env, agent):
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
|
||||
print('Finish testing')
|
||||
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
|
||||
print("完成测试")
|
||||
env.close()
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
return {'rewards':rewards}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -133,16 +121,14 @@ if __name__ == "__main__":
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg) # save parameters
|
||||
agent.save(path=cfg.model_path) # save model
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
|
||||
save_args(cfg,path = cfg.result_path) # 保存参数到模型路径上
|
||||
agent.save(path = cfg.model_path) # 保存模型
|
||||
save_results(res_dic, tag = 'train', path = cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train")
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
env, agent = env_agent_config(cfg) # 也可以不加,加这一行的是为了避免训练之后环境可能会出现问题,因此新建一个环境用于测试
|
||||
agent.load(path = cfg.model_path) # 导入模型
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果
|
||||
path = cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test") # 画出结果
|
||||
|
||||
@@ -63,18 +63,18 @@ class MLP(nn.Module):
|
||||
return self.fc3(x)
|
||||
|
||||
class DoubleDQN:
|
||||
def __init__(self, n_states, n_actions, cfg):
|
||||
def __init__(self, n_states, n_actions, model, memory, cfg):
|
||||
self.n_actions = n_actions # 总的动作个数
|
||||
self.device = torch.device(cfg.device) # 设备,cpu或gpu等
|
||||
self.gamma = cfg.gamma
|
||||
# e-greedy策略相关参数
|
||||
self.actions_count = 0
|
||||
self.sample_count = 0
|
||||
self.epsilon_start = cfg.epsilon_start
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.batch_size = cfg.batch_size
|
||||
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
|
||||
self.policy_net = model.to(self.device)
|
||||
self.target_net = model.to(self.device)
|
||||
# target_net copy from policy_net
|
||||
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
|
||||
target_param.data.copy_(param.data)
|
||||
@@ -82,13 +82,13 @@ class DoubleDQN:
|
||||
# 可查parameters()与state_dict()的区别,前者require_grad=True
|
||||
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
|
||||
self.loss = 0
|
||||
self.memory = ReplayBuffer(cfg.memory_capacity)
|
||||
self.memory = memory
|
||||
|
||||
def choose_action(self, state):
|
||||
def sample(self, state):
|
||||
'''选择动作
|
||||
'''
|
||||
self.actions_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.actions_count / self.epsilon_decay)
|
||||
self.sample_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
|
||||
if random.random() > self.epsilon:
|
||||
with torch.no_grad():
|
||||
# 先转为张量便于丢给神经网络,state元素数据原本为float64
|
||||
@@ -104,9 +104,16 @@ class DoubleDQN:
|
||||
else:
|
||||
action = random.randrange(self.n_actions)
|
||||
return action
|
||||
def predict(self, state):
    """Return the greedy action for ``state`` (no exploration).

    The state is converted to a float32 tensor on ``self.device``, given a
    leading batch axis of size one, and passed through ``self.policy_net``
    with gradient tracking disabled.

    Args:
        state: A single observation accepted by ``torch.as_tensor``.

    Returns:
        int: Index of the largest value along dimension 1 of the network
        output.
    """
    with torch.no_grad():
        # as_tensor + unsqueeze builds the (1, ...) batch directly instead of
        # wrapping the observation in a Python list first.
        state = torch.as_tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)
        q_value = self.policy_net(state)
        # max(1) returns (values, indices); keep the index of the best action.
        action = q_value.max(1)[1].item()
    return action
|
||||
def update(self):
|
||||
|
||||
if len(self.memory) < self.batch_size:
|
||||
if len(self.memory) < self.batch_size: # 只有memory满了才会更新
|
||||
return
|
||||
# 从memory中随机采样transition
|
||||
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
|
||||
@@ -150,7 +157,7 @@ class DoubleDQN:
|
||||
for param in self.policy_net.parameters(): # clip防止梯度爆炸
|
||||
param.grad.data.clamp_(-1, 1)
|
||||
self.optimizer.step() # 更新模型
|
||||
|
||||
|
||||
def save(self, path):
    """Write the target network's weights to ``path + 'checkpoint.pth'``.

    Args:
        path: Directory prefix. It is joined by plain string concatenation,
            so it must already end with a path separator.
    """
    torch.save(self.target_net.state_dict(), path + 'checkpoint.pth')
|
||||
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
{
|
||||
"algo_name": "DoubleDQN",
|
||||
"env_name": "CartPole-v0",
|
||||
"train_eps": 200,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.99,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 500,
|
||||
"lr": 0.0001,
|
||||
"memory_capacity": 100000,
|
||||
"batch_size": 64,
|
||||
"target_update": 2,
|
||||
"hidden_dim": 256,
|
||||
"device": "cuda",
|
||||
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/results/",
|
||||
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/models/",
|
||||
"save_fig": true
|
||||
}
|
||||
|
Before Width: | Height: | Size: 44 KiB |
|
Before Width: | Height: | Size: 44 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.95, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cpu", "result_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/results/", "model_path": "/root/Desktop/rl-tutorials/codes/DoubleDQN/outputs/CartPole-v0/20220803-104127/models/", "save_fig": true}
|
||||
|
After Width: | Height: | Size: 34 KiB |
|
After Width: | Height: | Size: 43 KiB |
@@ -20,31 +20,33 @@ import argparse
|
||||
|
||||
from common.utils import save_results,make_dir
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.models import MLP
|
||||
from common.memories import ReplayBuffer
|
||||
from DoubleDQN.double_dqn import DoubleDQN
|
||||
|
||||
def get_args():
|
||||
""" Hyperparameters
|
||||
""" 超参数
|
||||
"""
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
parser = argparse.ArgumentParser(description="hyperparameters")
|
||||
parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
|
||||
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
|
||||
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
|
||||
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
|
||||
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
|
||||
parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
|
||||
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
|
||||
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
|
||||
parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
|
||||
parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
|
||||
parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
|
||||
parser.add_argument('--batch_size',default=64,type=int)
|
||||
parser.add_argument('--target_update',default=2,type=int)
|
||||
parser.add_argument('--target_update',default=4,type=int)
|
||||
parser.add_argument('--hidden_dim',default=256,type=int)
|
||||
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
|
||||
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/results/' )
|
||||
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
|
||||
'/' + curr_time + '/models/' ) # path to save models
|
||||
'/' + curr_time + '/models/' ) # 保存模型的路径
|
||||
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
@@ -55,19 +57,20 @@ def env_agent_config(cfg,seed=1):
|
||||
env.seed(seed)
|
||||
n_states = env.observation_space.shape[0]
|
||||
n_actions = env.action_space.n
|
||||
agent = DoubleDQN(n_states,n_actions,cfg)
|
||||
model = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim)
|
||||
memory = ReplayBuffer(cfg.memory_capacity)
|
||||
agent = DoubleDQN(n_states,n_actions,model,memory,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print('Start training!')
|
||||
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
|
||||
print("开始训练!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
action = agent.choose_action(state)
|
||||
action = agent.sample(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
ep_reward += reward
|
||||
agent.memory.push(state, action, reward, next_state, done)
|
||||
@@ -78,61 +81,45 @@ def train(cfg,env,agent):
|
||||
if i_ep % cfg.target_update == 0:
|
||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||
if (i_ep+1)%10 == 0:
|
||||
print(f'Env:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(
|
||||
0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print('Finish training!')
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards}
|
||||
print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f},Epislon:{agent.epsilon:.3f}')
|
||||
rewards.append(ep_reward)
|
||||
print("完成训练!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('Start testing')
|
||||
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
|
||||
############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
################################################################################
|
||||
print("开始测试!")
|
||||
print(f"回合:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
action = agent.choose_action(state)
|
||||
action = agent.predict(state)
|
||||
next_state, reward, done, _ = env.step(action)
|
||||
state = next_state
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
|
||||
print('Finish testing!')
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards}
|
||||
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
|
||||
print("完成测试!")
|
||||
return {'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = get_args()
|
||||
print(cfg.device)
|
||||
# training
|
||||
env,agent = env_agent_config(cfg,seed=1)
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg,seed=1)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg)
|
||||
agent.save(path=cfg.model_path)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg) # 保存参数
|
||||
agent.save(path=cfg.model_path) # 保存模型
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
|
||||
# testing
|
||||
env,agent = env_agent_config(cfg,seed=10)
|
||||
agent.load(path=cfg.model_path)
|
||||
res_dic = test(cfg,env,agent)
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg,seed=1)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-12 16:14:34
|
||||
LastEditor: John
|
||||
LastEditTime: 2021-05-05 16:58:39
|
||||
LastEditTime: 2022-08-15 18:10:13
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
@@ -22,11 +22,10 @@ class FisrtVisitMC:
|
||||
self.epsilon = cfg.epsilon
|
||||
self.gamma = cfg.gamma
|
||||
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
|
||||
self.returns_sum = defaultdict(float) # sum of returns
|
||||
self.returns_sum = defaultdict(float) # 保存return之和
|
||||
self.returns_count = defaultdict(float)
|
||||
|
||||
def choose_action(self,state):
|
||||
''' e-greed policy '''
|
||||
def sample(self,state):
|
||||
if state in self.Q_table.keys():
|
||||
best_action = np.argmax(self.Q_table[state])
|
||||
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
|
||||
@@ -35,6 +34,15 @@ class FisrtVisitMC:
|
||||
else:
|
||||
action = np.random.randint(0,self.n_actions)
|
||||
return action
|
||||
def predict(self, state):
    """Return the greedy action for ``state``.

    For a state already present in ``self.Q_table`` this is the index of its
    largest Q-value. For an unseen state a uniformly random action in
    ``[0, self.n_actions)`` is returned, and the table is not modified.

    The epsilon-greedy probability vector that used to be built here was
    never used for the decision, so it has been removed; prediction no
    longer reads ``self.epsilon``.
    """
    # Membership test on the mapping itself does not insert a default entry.
    if state in self.Q_table:
        return np.argmax(self.Q_table[state])
    return np.random.randint(0, self.n_actions)
|
||||
def update(self,one_ep_transition):
|
||||
# Find all (state, action) pairs we've visited in this one_ep_transition
|
||||
# We convert each state to a tuple so that we can use it as a dict key
|
||||
@@ -50,16 +58,18 @@ class FisrtVisitMC:
|
||||
self.returns_sum[sa_pair] += G
|
||||
self.returns_count[sa_pair] += 1.0
|
||||
self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
|
||||
def save(self,path):
|
||||
def save(self,path=None):
|
||||
'''把 Q表格 的数据保存到文件中
|
||||
'''
|
||||
from pathlib import Path
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
torch.save(
|
||||
obj=self.Q_table,
|
||||
f=path+"Q_table",
|
||||
pickle_module=dill
|
||||
)
|
||||
|
||||
def load(self, path):
|
||||
def load(self, path=None):
|
||||
'''从文件中读取数据到 Q表格
|
||||
'''
|
||||
self.Q_table =torch.load(f=path+"Q_table",pickle_module=dill)
|
||||
|
Before Width: | Height: | Size: 79 KiB |
|
Before Width: | Height: | Size: 38 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "First-Visit MC", "env_name": "Racetrack", "train_eps": 200, "test_eps": 20, "gamma": 0.9, "epsilon": 0.15, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/models/", "save_fig": true}
|
||||
|
After Width: | Height: | Size: 48 KiB |
|
After Width: | Height: | Size: 40 KiB |
110
projects/codes/MonteCarlo/task0.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-11 14:26:44
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-15 18:12:13
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import datetime
|
||||
import argparse
|
||||
from common.utils import save_results,save_args,plot_rewards
|
||||
|
||||
from MonteCarlo.agent import FisrtVisitMC
|
||||
from envs.racetrack_env import RacetrackEnv
|
||||
|
||||
curr_time = datetime.datetime.now().strftime(
|
||||
"%Y%m%d-%H%M%S") # obtain current time
|
||||
|
||||
def _str2bool(value):
    """Convert a command-line token such as ``True``/``false``/``0`` to bool."""
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_args():
    """Parse the hyperparameters from the command line.

    ``--result_path`` and ``--model_path`` default to
    ``<curr_path>/outputs/<env_name>/<timestamp>/results/`` and
    ``.../models/``. These defaults are filled in after parsing. The previous
    version called ``parser.parse_args()`` while the parser was still being
    built, so passing any later-registered flag aborted with
    "unrecognized arguments".

    Boolean flags use ``_str2bool`` because ``type=bool`` treats every
    non-empty string, including ``"False"``, as true.

    Returns:
        argparse.Namespace: The parsed hyperparameters.
    """
    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp of this run
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name', default='First-Visit MC', type=str, help="name of algorithm")
    parser.add_argument('--env_name', default='Racetrack', type=str, help="name of environment")
    parser.add_argument('--train_eps', default=200, type=int, help="episodes of training")
    parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
    parser.add_argument('--gamma', default=0.9, type=float, help="discounted factor")
    parser.add_argument('--epsilon', default=0.15, type=float, help="the probability to select a random action")
    parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
    parser.add_argument('--result_path', default=None, type=str, help="path to save results")
    parser.add_argument('--model_path', default=None, type=str, help="path to save models")
    parser.add_argument('--show_fig', default=False, type=_str2bool, help="if show figure or not")
    parser.add_argument('--save_fig', default=True, type=_str2bool, help="if save figure or not")
    args = parser.parse_args()

    # Derive the output folders from the final env_name unless given explicitly.
    run_dir = curr_path + "/outputs/" + args.env_name + '/' + curr_time
    if args.result_path is None:
        args.result_path = run_dir + '/results/'
    if args.model_path is None:
        args.model_path = run_dir + '/models/'
    return args
|
||||
|
||||
def env_agent_config(cfg, seed=1):
    """Create the racetrack environment and a first-visit Monte Carlo agent.

    Args:
        cfg: Hyperparameter object forwarded unchanged to ``FisrtVisitMC``.
        seed: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(env, agent)``.
    """
    env = RacetrackEnv()
    n_actions = env.action_space.n  # size of the action space, read from the env
    agent = FisrtVisitMC(n_actions, cfg)
    return env, agent
|
||||
|
||||
def train(cfg, env, agent):
    """Run ``cfg.train_eps`` training episodes and collect the episode returns.

    In each episode, actions come from ``agent.sample``. Every
    ``(state, action, reward)`` triple is recorded, and the whole list is
    handed to ``agent.update`` once the environment reports ``done``.

    Args:
        cfg: Needs ``env_name``, ``algo_name``, ``device`` (logging only) and
            ``train_eps``.
        env: Object with ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done)``.
        agent: Object with ``sample(state)`` and ``update(transitions)``.

    Returns:
        dict: ``{'rewards': [...]}`` with one summed reward per episode.
    """
    print("开始训练!")
    print(f"环境:{cfg.env_name},算法:{cfg.algo_name},设备:{cfg.device}")
    rewards = []
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        one_ep_transition = []  # full trajectory of this episode
        while True:
            action = agent.sample(state)
            next_state, reward, done = env.step(action)
            ep_reward += reward
            one_ep_transition.append((state, action, reward))
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        # The agent is updated once per episode, from the complete trajectory.
        agent.update(one_ep_transition)
        if (i_ep+1) % 10 == 0:
            print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
    print("完成训练")
    return {'rewards':rewards}
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print("开始测试!")
|
||||
print(f"环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}")
|
||||
rewards = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
action = agent.predict(state)
|
||||
next_state, reward, done = env.step(action)
|
||||
ep_reward += reward
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
print(f'回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.2f}')
|
||||
return {'rewards':rewards}
|
||||
|
||||
if __name__ == "__main__":
    cfg = get_args()
    # Training
    env, agent = env_agent_config(cfg)
    res_dic = train(cfg, env, agent)
    save_args(cfg, path = cfg.result_path)  # save the hyperparameters under the result path
    agent.save(path = cfg.model_path)  # save the model
    save_results(res_dic, tag = 'train', path = cfg.result_path)
    plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path, tag = "train")
    # Testing
    # Optional: rebuild the env/agent so that any environment state left over
    # from training cannot affect the evaluation run.
    env, agent = env_agent_config(cfg)
    agent.load(path = cfg.model_path)  # load the saved model
    res_dic = test(cfg, env, agent)
    save_results(res_dic, tag='test',
                 path = cfg.result_path)  # save the results
    plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path, tag = "test")  # plot the results
|
||||
@@ -1,118 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-11 14:26:44
|
||||
LastEditor: John
|
||||
LastEditTime: 2021-05-05 17:27:50
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(__file__)
|
||||
parent_path = os.path.dirname(curr_path)
|
||||
sys.path.append(parent_path) # add current terminal path to sys.path
|
||||
|
||||
import torch
|
||||
import datetime
|
||||
|
||||
from common.utils import save_results,make_dir
|
||||
from common.plot import plot_rewards
|
||||
from MonteCarlo.agent import FisrtVisitMC
|
||||
from envs.racetrack_env import RacetrackEnv
|
||||
|
||||
curr_time = datetime.datetime.now().strftime(
|
||||
"%Y%m%d-%H%M%S") # obtain current time
|
||||
|
||||
class MCConfig:
|
||||
def __init__(self):
|
||||
self.algo = "MC" # name of algo
|
||||
self.env = 'Racetrack'
|
||||
self.result_path = curr_path+"/outputs/" + self.env + \
|
||||
'/'+curr_time+'/results/' # path to save results
|
||||
self.model_path = curr_path+"/outputs/" + self.env + \
|
||||
'/'+curr_time+'/models/' # path to save models
|
||||
# epsilon: The probability to select a random action .
|
||||
self.epsilon = 0.15
|
||||
self.gamma = 0.9 # gamma: Gamma discount factor.
|
||||
self.train_eps = 200
|
||||
self.device = torch.device(
|
||||
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = RacetrackEnv()
|
||||
n_actions = 9
|
||||
agent = FisrtVisitMC(n_actions, cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg, env, agent):
|
||||
print('Start to eval !')
|
||||
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
|
||||
rewards = []
|
||||
ma_rewards = [] # moving average rewards
|
||||
for i_ep in range(cfg.train_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
one_ep_transition = []
|
||||
while True:
|
||||
action = agent.choose_action(state)
|
||||
next_state, reward, done = env.step(action)
|
||||
ep_reward += reward
|
||||
one_ep_transition.append((state, action, reward))
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
agent.update(one_ep_transition)
|
||||
if (i_ep+1) % 10 == 0:
|
||||
print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
|
||||
print('Complete training!')
|
||||
return rewards, ma_rewards
|
||||
|
||||
def eval(cfg, env, agent):
|
||||
print('Start to eval !')
|
||||
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
|
||||
rewards = []
|
||||
ma_rewards = [] # moving average rewards
|
||||
for i_ep in range(cfg.train_eps):
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
action = agent.choose_action(state)
|
||||
next_state, reward, done = env.step(action)
|
||||
ep_reward += reward
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
if (i_ep+1) % 10 == 0:
|
||||
print(f"Episode:{i_ep+1}/{cfg.train_eps}: Reward:{ep_reward}")
|
||||
return rewards, ma_rewards
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = MCConfig()
|
||||
|
||||
# train
|
||||
env,agent = env_agent_config(cfg,seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
agent.save(path=cfg.model_path)
|
||||
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
|
||||
plot_rewards(rewards, ma_rewards, tag="train",
|
||||
algo=cfg.algo, path=cfg.result_path)
|
||||
# eval
|
||||
env,agent = env_agent_config(cfg,seed=10)
|
||||
agent.load(path=cfg.model_path)
|
||||
rewards,ma_rewards = eval(cfg,env,agent)
|
||||
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
|
||||
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
|
||||
|
Before Width: | Height: | Size: 23 KiB |
|
Before Width: | Height: | Size: 34 KiB |
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"algo_name": "Q-learning",
|
||||
"env_name": "CliffWalking-v0",
|
||||
"train_eps": 400,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.9,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 300,
|
||||
"lr": 0.1,
|
||||
"device": "cpu",
|
||||
"result_path": "/root/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220802-163256/results/",
|
||||
"model_path": "/root/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220802-163256/models/",
|
||||
"save_fig": true
|
||||
}
|
||||
|
After Width: | Height: | Size: 25 KiB |
|
After Width: | Height: | Size: 36 KiB |
@@ -15,18 +15,20 @@ import torch
|
||||
from collections import defaultdict
|
||||
|
||||
class QLearning(object):
|
||||
def __init__(self,n_states,
|
||||
def __init__(self,
|
||||
n_actions,cfg):
|
||||
self.n_actions = n_actions
|
||||
self.lr = cfg.lr # 学习率
|
||||
self.gamma = cfg.gamma
|
||||
self.epsilon = 0
|
||||
self.epsilon = cfg.epsilon_start
|
||||
self.sample_count = 0
|
||||
self.epsilon_start = cfg.epsilon_start
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表
|
||||
def choose_action(self, state):
|
||||
def sample(self, state):
|
||||
''' 采样动作,训练时用
|
||||
'''
|
||||
self.sample_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
|
||||
math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的,这里选择指数递减
|
||||
@@ -37,6 +39,8 @@ class QLearning(object):
|
||||
action = np.random.choice(self.n_actions) # 随机选择动作
|
||||
return action
|
||||
def predict(self, state):
    """Return the greedy action for ``state``; used at test time.

    The state is converted with ``str()`` before it is used as the key into
    ``self.Q_table``, and the index of the largest Q-value for that key is
    returned.
    """
    action = np.argmax(self.Q_table[str(state)])
    return action
|
||||
def update(self, state, action, reward, next_state, done):
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-09-11 23:03:00
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-06-21 19:36:05
|
||||
LastEditTime: 2022-08-10 11:25:56
|
||||
Discription:
|
||||
Environment:
|
||||
'''
|
||||
@@ -18,54 +18,45 @@ sys.path.append(parent_path) # 添加路径到系统路径
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
|
||||
from env.gridworld_env import CliffWalkingWapper
|
||||
import argparse
|
||||
from envs.gridworld_env import CliffWalkingWapper
|
||||
from qlearning import QLearning
|
||||
from common.utils import plot_rewards
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.utils import save_results,make_dir
|
||||
|
||||
def _str2bool(value):
    """Convert a command-line token such as ``True``/``false``/``0`` to bool."""
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def get_args():
    """Parse the Q-learning hyperparameters from the command line.

    ``--result_path`` and ``--model_path`` default to
    ``<curr_path>/outputs/<env_name>/<timestamp>/results/`` and
    ``.../models/``. These defaults are filled in after parsing. The previous
    version called ``parser.parse_args()`` while the parser was still being
    built, so passing any later-registered flag aborted with
    "unrecognized arguments".

    ``--save_fig`` uses ``_str2bool`` because ``type=bool`` treats every
    non-empty string, including ``"False"``, as true.

    Returns:
        argparse.Namespace: The parsed hyperparameters.
    """
    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp of this run
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name', default='Q-learning', type=str, help="name of algorithm")
    parser.add_argument('--env_name', default='CliffWalking-v0', type=str, help="name of environment")
    parser.add_argument('--train_eps', default=400, type=int, help="episodes of training")
    parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
    parser.add_argument('--gamma', default=0.90, type=float, help="discounted factor")
    parser.add_argument('--epsilon_start', default=0.95, type=float, help="initial value of epsilon")
    parser.add_argument('--epsilon_end', default=0.01, type=float, help="final value of epsilon")
    parser.add_argument('--epsilon_decay', default=300, type=int, help="decay rate of epsilon")
    parser.add_argument('--lr', default=0.1, type=float, help="learning rate")
    parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
    parser.add_argument('--result_path', default=None, type=str, help="path to save results")
    parser.add_argument('--model_path', default=None, type=str, help="path to save models")
    parser.add_argument('--save_fig', default=True, type=_str2bool, help="if save figure or not")
    args = parser.parse_args()

    # Derive the output folders from the final env_name unless given explicitly.
    run_dir = curr_path + "/outputs/" + args.env_name + '/' + curr_time
    if args.result_path is None:
        args.result_path = run_dir + '/results/'
    if args.model_path is None:
        args.model_path = run_dir + '/models/'
    return args
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
class Config:
|
||||
'''超参数
|
||||
'''
|
||||
|
||||
def __init__(self):
|
||||
################################## 环境超参数 ###################################
|
||||
self.algo_name = 'Q-learning' # 算法名称
|
||||
self.env_name = 'CliffWalking-v0' # 环境名称
|
||||
self.device = torch.device(
|
||||
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
|
||||
self.seed = 10 # 随机种子,置0则不设置随机种子
|
||||
self.train_eps = 400 # 训练的回合数
|
||||
self.test_eps = 30 # 测试的回合数
|
||||
################################################################################
|
||||
|
||||
################################## 算法超参数 ###################################
|
||||
self.gamma = 0.90 # 强化学习中的折扣因子
|
||||
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
|
||||
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
|
||||
self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
|
||||
self.lr = 0.1 # 学习率
|
||||
################################################################################
|
||||
|
||||
################################# 保存结果相关参数 ################################
|
||||
self.result_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/results/' # 保存结果的路径
|
||||
self.model_path = curr_path + "/outputs/" + self.env_name + \
|
||||
'/' + curr_time + '/models/' # 保存模型的路径
|
||||
self.save = True # 是否保存图片
|
||||
################################################################################
|
||||
|
||||
|
||||
def train(cfg,env,agent):
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录奖励
|
||||
ma_rewards = [] # 记录滑动平均奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
ep_reward = 0 # 记录每个回合的奖励
|
||||
state = env.reset() # 重置环境,即开始新的回合
|
||||
while True:
|
||||
action = agent.choose_action(state) # 根据算法选择一个动作
|
||||
action = agent.sample(state) # 根据算法采样一个动作
|
||||
next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
|
||||
agent.update(state, action, reward, next_state, done) # Q学习算法更新
|
||||
state = next_state # 更新状态
|
||||
@@ -73,19 +64,14 @@ def train(cfg,env,agent):
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
|
||||
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f},Epsilon:{agent.epsilon}")
|
||||
print('完成训练!')
|
||||
return rewards,ma_rewards
|
||||
return {"rewards":rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 滑动平均的奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录每个episode的reward
|
||||
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
|
||||
@@ -97,13 +83,9 @@ def test(cfg,env,agent):
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return rewards,ma_rewards
|
||||
return {"rewards":rewards}
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
'''创建环境和智能体
|
||||
@@ -119,23 +101,27 @@ def env_agent_config(cfg,seed=1):
|
||||
env.seed(seed) # 设置随机种子
|
||||
n_states = env.observation_space.n # 状态维度
|
||||
n_actions = env.action_space.n # 动作维度
|
||||
agent = QLearning(n_states,n_actions,cfg)
|
||||
print(f"状态数:{n_states},动作数:{n_actions}")
|
||||
agent = QLearning(n_actions,cfg)
|
||||
return env,agent
|
||||
if __name__ == "__main__":
|
||||
cfg = Config()
|
||||
cfg = get_args()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg, seed=1)
|
||||
rewards, ma_rewards = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
|
||||
agent.save(path=cfg.model_path) # 保存模型
|
||||
save_results(rewards, ma_rewards, tag='train',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg) # save parameters
|
||||
agent.save(path=cfg.model_path) # save model
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg, seed=10)
|
||||
env, agent = env_agent_config(cfg)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
rewards, ma_rewards = test(cfg, env, agent)
|
||||
save_results(rewards, ma_rewards, tag='test', path=cfg.result_path) # 保存结果
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果
|
||||
|
||||
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 40 KiB |
|
Before Width: | Height: | Size: 49 KiB |
@@ -0,0 +1 @@
|
||||
{"algo_name": "Sarsa", "env_name": "CliffWalking-v0", "train_eps": 300, "test_eps": 20, "ep_max_steps": 200, "gamma": 0.99, "epsilon_start": 0.9, "epsilon_end": 0.01, "epsilon_decay": 200, "lr": 0.2, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/models/", "save_fig": true}
|
||||
|
After Width: | Height: | Size: 34 KiB |
|
After Width: | Height: | Size: 54 KiB |
@@ -0,0 +1,15 @@
|
||||
{
|
||||
"algo_name": "Sarsa",
|
||||
"env_name": "CliffWalking-v0",
|
||||
"train_eps": 400,
|
||||
"test_eps": 20,
|
||||
"gamma": 0.9,
|
||||
"epsilon_start": 0.95,
|
||||
"epsilon_end": 0.01,
|
||||
"epsilon_decay": 300,
|
||||
"lr": 0.1,
|
||||
"device": "cpu",
|
||||
"result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/results/",
|
||||
"model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/models/",
|
||||
"save_fig": true
|
||||
}
|
||||
|
After Width: | Height: | Size: 25 KiB |
|
After Width: | Height: | Size: 34 KiB |
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-12 16:58:16
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-04-29 20:12:57
|
||||
LastEditTime: 2022-08-04 22:22:16
|
||||
Description:
|
||||
Environment:
|
||||
'''
|
||||
@@ -15,7 +15,7 @@ import torch
|
||||
import math
|
||||
class Sarsa(object):
|
||||
def __init__(self,
|
||||
n_actions,cfg,):
|
||||
n_actions,cfg):
|
||||
self.n_actions = n_actions
|
||||
self.lr = cfg.lr
|
||||
self.gamma = cfg.gamma
|
||||
@@ -24,7 +24,7 @@ class Sarsa(object):
|
||||
self.epsilon_end = cfg.epsilon_end
|
||||
self.epsilon_decay = cfg.epsilon_decay
|
||||
self.Q = defaultdict(lambda: np.zeros(n_actions)) # Q table
|
||||
def choose_action(self, state):
|
||||
def sample(self, state):
|
||||
self.sample_count += 1
|
||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
|
||||
math.exp(-1. * self.sample_count / self.epsilon_decay) # The probability to select a random action, is is log decayed
|
||||
@@ -33,14 +33,14 @@ class Sarsa(object):
|
||||
action_probs[best_action] += (1.0 - self.epsilon)
|
||||
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
|
||||
return action
|
||||
def predict_action(self,state):
|
||||
def predict(self,state):
|
||||
return np.argmax(self.Q[state])
|
||||
def update(self, state, action, reward, next_state, next_action,done):
|
||||
Q_predict = self.Q[state][action]
|
||||
if done:
|
||||
Q_target = reward # terminal state
|
||||
Q_target = reward # 终止状态
|
||||
else:
|
||||
Q_target = reward + self.gamma * self.Q[next_state][next_action]
|
||||
Q_target = reward + self.gamma * self.Q[next_state][next_action] # 与Q learning不同,Sarsa是拿下一步动作对应的Q值去更新
|
||||
self.Q[state][action] += self.lr * (Q_target - Q_predict)
|
||||
def save(self,path):
|
||||
'''把 Q表格 的数据保存到文件中
|
||||
|
||||
@@ -5,115 +5,114 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-11 17:59:16
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-04-29 20:18:13
|
||||
LastEditTime: 2022-08-04 22:28:51
|
||||
Description:
|
||||
Environment:
|
||||
'''
|
||||
import sys,os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path of file
|
||||
parent_path = os.path.dirname(curr_path)
|
||||
sys.path.append(parent_path) # add current terminal path to sys.path
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import datetime
|
||||
import torch
|
||||
import argparse
|
||||
from envs.racetrack_env import RacetrackEnv
|
||||
from Sarsa.sarsa import Sarsa
|
||||
from common.utils import save_results,make_dir,plot_rewards
|
||||
from common.utils import save_results,make_dir,plot_rewards,save_args
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
|
||||
def get_args():
    """Parse the command-line hyperparameters for the Sarsa run.

    The output directories default to
    ``<curr_path>/outputs/<env_name>/<timestamp>/results/`` and ``.../models/``.
    They are resolved *after* parsing, so they follow ``--env_name`` and can be
    overridden with ``--result_path`` / ``--model_path``.  (Previously the
    defaults were built by calling ``parser.parse_args()`` while the parser was
    still being assembled, which aborted with "unrecognized arguments" whenever
    one of the later options was passed on the command line.)

    Returns:
        argparse.Namespace: The parsed hyperparameters.
    """

    def _str2bool(value):
        """Convert a command-line token to a bool.

        ``type=bool`` is not usable with argparse: ``bool("False")`` is True
        because every non-empty string is truthy.
        """
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

    # Timestamp used to give every run its own output folder.
    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name', default='Sarsa', type=str, help="name of algorithm")
    parser.add_argument('--env_name', default='CliffWalking-v0', type=str, help="name of environment")
    parser.add_argument('--train_eps', default=300, type=int, help="episodes of training")
    parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
    parser.add_argument('--ep_max_steps', default=200, type=int, help="maximum steps per episode")
    parser.add_argument('--gamma', default=0.99, type=float, help="discounted factor")
    parser.add_argument('--epsilon_start', default=0.90, type=float, help="initial value of epsilon")
    parser.add_argument('--epsilon_end', default=0.01, type=float, help="final value of epsilon")
    parser.add_argument('--epsilon_decay', default=200, type=int, help="decay rate of epsilon")
    parser.add_argument('--lr', default=0.2, type=float, help="learning rate")
    parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
    # None means "derive from env_name and the timestamp" (filled in below).
    parser.add_argument('--result_path', default=None, type=str, help="path to save results")
    parser.add_argument('--model_path', default=None, type=str, help="path to save models")
    parser.add_argument('--save_fig', default=True, type=_str2bool, help="if save figure or not")
    args = parser.parse_args()

    # Same layout the original defaults produced.
    run_dir = curr_path + "/outputs/" + args.env_name + '/' + curr_time
    if args.result_path is None:
        args.result_path = run_dir + '/results/'
    if args.model_path is None:
        args.model_path = run_dir + '/models/'
    return args
|
||||
|
||||
class Config:
    """Hard-coded hyperparameters and output locations for the Sarsa run.

    Reads the module-level ``curr_path`` and ``curr_time`` to build a per-run
    output directory of the form
    ``<curr_path>/outputs/<env_name>/<curr_time>/``.
    """

    def __init__(self):
        # Was 'Qlearning' (copy-paste slip): this config drives the Sarsa agent
        # and the name is what shows up in the logs and plot titles.
        self.algo_name = 'Sarsa'
        # Action legend kept from the original comment: 0 up, 1 right, 2 down, 3 left
        self.env_name = 'CliffWalking-v0'
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when available
        run_dir = curr_path + "/outputs/" + self.env_name + '/' + curr_time
        self.result_path = run_dir + '/results/'  # path to save results
        self.model_path = run_dir + '/models/'  # path to save models
        self.train_eps = 300  # training episodes
        self.test_eps = 20  # testing episodes
        self.n_steps = 200  # maximum steps per episode
        self.epsilon_start = 0.90  # start value of epsilon
        self.epsilon_end = 0.01  # end value of epsilon
        self.epsilon_decay = 200  # decay rate of epsilon
        self.gamma = 0.99  # discount factor
        self.lr = 0.2  # learning rate (step size)
        self.save = True  # whether to save figures
        # plot_rewards() checks `cfg.save_fig`; expose the flag under that name
        # too so plotting does not fail with AttributeError.
        self.save_fig = self.save
|
||||
|
||||
def env_agent_config(cfg,seed=1):
|
||||
env = RacetrackEnv()
|
||||
n_states = 9 # number of actions
|
||||
agent = Sarsa(n_states,cfg)
|
||||
n_actions = 9 # 动作数
|
||||
agent = Sarsa(n_actions,cfg)
|
||||
return env,agent
|
||||
|
||||
def train(cfg,env,agent):
|
||||
rewards = []
|
||||
ma_rewards = []
|
||||
print('开始训练!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录奖励
|
||||
for i_ep in range(cfg.train_eps):
|
||||
state = env.reset()
|
||||
action = agent.choose_action(state)
|
||||
action = agent.sample(state)
|
||||
ep_reward = 0
|
||||
# while True:
|
||||
for _ in range(cfg.n_steps):
|
||||
for _ in range(cfg.ep_max_steps):
|
||||
next_state, reward, done = env.step(action)
|
||||
ep_reward+=reward
|
||||
next_action = agent.choose_action(next_state)
|
||||
next_action = agent.sample(next_state)
|
||||
agent.update(state, action, reward, next_state, next_action,done)
|
||||
state = next_state
|
||||
action = next_action
|
||||
if done:
|
||||
break
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep+1)%2==0:
|
||||
print(f"Episode:{i_ep+1}, Reward:{ep_reward}, Epsilon:{agent.epsilon}")
|
||||
return rewards,ma_rewards
|
||||
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f},Epsilon:{agent.epsilon}")
|
||||
print('完成训练!')
|
||||
return {"rewards":rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = []
|
||||
ma_rewards = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
# Print out which episode we're on, useful for debugging.
|
||||
# Generate an episode.
|
||||
# An episode is an array of (state, action, reward) tuples
|
||||
state = env.reset()
|
||||
ep_reward = 0
|
||||
while True:
|
||||
# for _ in range(cfg.n_steps):
|
||||
action = agent.predict_action(state)
|
||||
# while True:
|
||||
for _ in range(cfg.ep_max_steps):
|
||||
action = agent.predict(state)
|
||||
next_state, reward, done = env.step(action)
|
||||
ep_reward+=reward
|
||||
state = next_state
|
||||
if done:
|
||||
break
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
rewards.append(ep_reward)
|
||||
if (i_ep+1)%1==0:
|
||||
print("Episode:{}/{}: Reward:{}".format(i_ep+1, cfg.test_eps,ep_reward))
|
||||
print('Complete testing!')
|
||||
return rewards,ma_rewards
|
||||
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return {"rewards":rewards}
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = Config()
|
||||
env,agent = env_agent_config(cfg,seed=1)
|
||||
rewards,ma_rewards = train(cfg,env,agent)
|
||||
make_dir(cfg.result_path,cfg.model_path)
|
||||
agent.save(path=cfg.model_path)
|
||||
save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="train")
|
||||
|
||||
env,agent = env_agent_config(cfg,seed=10)
|
||||
agent.load(path=cfg.model_path)
|
||||
rewards,ma_rewards = test(cfg,env,agent)
|
||||
save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
|
||||
plot_rewards(rewards, ma_rewards, cfg, tag="test")
|
||||
cfg = get_args()
|
||||
# 训练
|
||||
env, agent = env_agent_config(cfg)
|
||||
res_dic = train(cfg, env, agent)
|
||||
make_dir(cfg.result_path, cfg.model_path)
|
||||
save_args(cfg) # save parameters
|
||||
agent.save(path=cfg.model_path) # save model
|
||||
save_results(res_dic, tag='train',
|
||||
path=cfg.result_path)
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="train")
|
||||
# 测试
|
||||
env, agent = env_agent_config(cfg)
|
||||
agent.load(path=cfg.model_path) # 导入模型
|
||||
res_dic = test(cfg, env, agent)
|
||||
save_results(res_dic, tag='test',
|
||||
path=cfg.result_path) # 保存结果
|
||||
plot_rewards(res_dic['rewards'], cfg, tag="test") # 画出结果
|
||||
|
||||
|
||||
|
||||
|
||||
131
projects/codes/Sarsa/task1.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
'''
|
||||
Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2020-09-11 23:03:00
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-08-04 22:44:00
|
||||
Description:
|
||||
Environment:
|
||||
'''
|
||||
import sys
|
||||
import os
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import argparse
|
||||
from envs.gridworld_env import CliffWalkingWapper
|
||||
from Sarsa.sarsa import Sarsa
|
||||
from common.utils import plot_rewards,save_args
|
||||
from common.utils import save_results,make_dir
|
||||
|
||||
|
||||
def get_args():
    """Build the hyperparameter namespace for the Sarsa CliffWalking task.

    The parser is evaluated against an empty argument list, so the defaults
    below are always what is returned and the host process's ``sys.argv`` is
    never read.  (Previously the path defaults were built with an extra
    ``parser.parse_args()`` call that *did* read ``sys.argv`` part-way through
    parser construction; that aborts under hosts with foreign arguments and
    could produce output paths that disagree with ``args.env_name``.)

    Returns:
        argparse.Namespace: The hyperparameters, including ``result_path`` and
        ``model_path`` of the form
        ``<curr_path>/outputs/<env_name>/<timestamp>/{results,models}/``.
    """
    # Timestamp used to give every run its own output folder.
    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name', default='Sarsa', type=str, help="name of algorithm")
    parser.add_argument('--env_name', default='CliffWalking-v0', type=str, help="name of environment")
    parser.add_argument('--train_eps', default=400, type=int, help="episodes of training")
    parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
    parser.add_argument('--gamma', default=0.90, type=float, help="discounted factor")
    parser.add_argument('--epsilon_start', default=0.95, type=float, help="initial value of epsilon")
    parser.add_argument('--epsilon_end', default=0.01, type=float, help="final value of epsilon")
    parser.add_argument('--epsilon_decay', default=300, type=int, help="decay rate of epsilon")
    parser.add_argument('--lr', default=0.1, type=float, help="learning rate")
    parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
    # None means "derive from env_name and the timestamp" (filled in below).
    parser.add_argument('--result_path', default=None, type=str, help="path to save results")
    parser.add_argument('--model_path', default=None, type=str, help="path to save models")
    # NOTE: type=bool would map any non-empty string to True; it is harmless
    # here only because no command-line tokens are ever parsed.
    parser.add_argument('--save_fig', default=True, type=bool, help="if save figure or not")
    args = parser.parse_args([])

    # Same layout the original defaults produced.
    run_dir = curr_path + "/outputs/" + args.env_name + '/' + curr_time
    if args.result_path is None:
        args.result_path = run_dir + '/results/'
    if args.model_path is None:
        args.model_path = run_dir + '/models/'
    return args
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
|
||||
def train(cfg, env, agent):
    """Train ``agent`` with on-policy Sarsa for ``cfg.train_eps`` episodes.

    Args:
        cfg: Settings object; reads ``env_name``, ``algo_name``, ``device``
            (for logging) and ``train_eps``.
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        agent: Object providing ``sample(state)``,
            ``update(state, action, reward, next_state, next_action, done)``
            and an ``epsilon`` attribute (used for logging).

    Returns:
        dict: ``{"rewards": [...]}`` with the summed reward of every episode.
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # total reward of each episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated in this episode
        state = env.reset()  # start a new episode
        action = agent.sample(state)  # first action of the episode
        while True:
            # Sarsa is on-policy: the action executed here must be the very
            # `next_action` that was plugged into the previous update.  The
            # earlier version re-sampled at this point, which discarded that
            # action and called agent.sample() twice per step.
            next_state, reward, done, _ = env.step(action)
            next_action = agent.sample(next_state)
            agent.update(state, action, reward, next_state, next_action, done)
            state = next_state
            action = next_action
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f},Epsilon:{agent.epsilon}")
    print('完成训练!')
    return {"rewards": rewards}
|
||||
|
||||
def test(cfg,env,agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录每个episode的reward
|
||||
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
|
||||
while True:
|
||||
action = agent.predict(state) # 根据算法选择一个动作
|
||||
next_state, reward, done, _ = env.step(action) # 与环境进行一个交互
|
||||
state = next_state # 更新状态
|
||||
ep_reward += reward
|
||||
if done:
|
||||
break
|
||||
rewards.append(ep_reward)
|
||||
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
|
||||
print('完成测试!')
|
||||
return {"rewards":rewards}
|
||||
|
||||
def env_agent_config(cfg, seed=1):
    """Create the wrapped environment and the Sarsa agent.

    Args:
        cfg: Settings object; ``cfg.env_name`` selects the gym environment and
            the whole object is handed to the agent.
        seed (int, optional): Random seed passed to ``env.seed``. Defaults to 1.

    Returns:
        tuple: ``(env, agent)``.
    """
    env = CliffWalkingWapper(gym.make(cfg.env_name))
    env.seed(seed)  # seed the environment
    state_count, action_count = env.observation_space.n, env.action_space.n
    print(f"状态数:{state_count},动作数:{action_count}")
    return env, Sarsa(action_count, cfg)
|
||||
# Script entry point: train a Sarsa agent, persist the artifacts, then
# evaluate a freshly built agent that loads the saved model.
if __name__ == "__main__":
    cfg = get_args()
    # Training
    env, agent = env_agent_config(cfg)
    res_dic = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)
    save_args(cfg) # save parameters
    agent.save(path=cfg.model_path)  # save model
    save_results(res_dic, tag='train',
                 path=cfg.result_path)
    plot_rewards(res_dic['rewards'], cfg, tag="train")
    # Testing
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # load the model saved after training
    res_dic = test(cfg, env, agent)
    save_results(res_dic, tag='test',
                 path=cfg.result_path)  # save the results
    plot_rewards(res_dic['rewards'], cfg, tag="test")  # plot the results
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ Author: John
|
||||
Email: johnjim0816@gmail.com
|
||||
Date: 2021-03-12 16:02:24
|
||||
LastEditor: John
|
||||
LastEditTime: 2022-07-31 23:18:04
|
||||
LastEditTime: 2022-08-15 18:11:27
|
||||
Description:
|
||||
Environment:
|
||||
'''
|
||||
@@ -42,21 +42,36 @@ def plot_rewards_cn(rewards, ma_rewards, cfg, tag='train'):
|
||||
if cfg.save:
|
||||
plt.savefig(cfg.result_path+f"{tag}_rewards_curve_cn")
|
||||
# plt.show()
|
||||
def smooth(data, weight=0.9):
|
||||
'''用于平滑曲线,类似于Tensorboard中的smooth
|
||||
|
||||
Args:
|
||||
data (List):输入数据
|
||||
weight (Float): 平滑权重,处于0-1之间,数值越高说明越平滑,一般取0.9
|
||||
|
||||
def plot_rewards(rewards, ma_rewards, cfg, tag='train'):
|
||||
Returns:
|
||||
smoothed (List): 平滑后的数据
|
||||
'''
|
||||
last = data[0] # First value in the plot (first timestep)
|
||||
smoothed = list()
|
||||
for point in data:
|
||||
smoothed_val = last * weight + (1 - weight) * point # 计算平滑值
|
||||
smoothed.append(smoothed_val)
|
||||
last = smoothed_val
|
||||
return smoothed
|
||||
|
||||
def plot_rewards(rewards,cfg,path=None,tag='train'):
|
||||
sns.set()
|
||||
plt.figure() # 创建一个图形实例,方便同时多画几个图
|
||||
plt.title("learning curve on {} of {} for {}".format(
|
||||
cfg.device, cfg.algo_name, cfg.env_name))
|
||||
plt.title(f"{tag}ing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}")
|
||||
plt.xlabel('epsiodes')
|
||||
plt.plot(rewards, label='rewards')
|
||||
plt.plot(ma_rewards, label='ma rewards')
|
||||
plt.plot(smooth(rewards), label='smoothed')
|
||||
plt.legend()
|
||||
if cfg.save_fig:
|
||||
plt.savefig(cfg.result_path+"{}_rewards_curve".format(tag))
|
||||
plt.show()
|
||||
|
||||
plt.savefig(f"{path}/{tag}ing_curve.png")
|
||||
if cfg.show_fig:
|
||||
plt.show()
|
||||
|
||||
def plot_losses(losses, algo="DQN", save=True, path='./'):
|
||||
sns.set()
|
||||
@@ -69,19 +84,13 @@ def plot_losses(losses, algo="DQN", save=True, path='./'):
|
||||
plt.savefig(path+"losses_curve")
|
||||
plt.show()
|
||||
|
||||
def save_results(dic, tag='train', path='./results'):
|
||||
def save_results(dic, tag='train', path = None):
|
||||
''' 保存奖励
|
||||
'''
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
for key,value in dic.items():
|
||||
np.save(path+'{}_{}.npy'.format(tag,key),value)
|
||||
print('Results saved!')
|
||||
|
||||
# def save_results(rewards, ma_rewards, tag='train', path='./results'):
|
||||
# ''' 保存奖励
|
||||
# '''
|
||||
# np.save(path+'{}_rewards.npy'.format(tag), rewards)
|
||||
# np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
|
||||
# print('Result saved!')
|
||||
|
||||
|
||||
def make_dir(*paths):
|
||||
@@ -100,27 +109,10 @@ def del_empty_dir(*paths):
|
||||
if not os.listdir(os.path.join(path, dir)):
|
||||
os.removedirs(os.path.join(path, dir))
|
||||
|
||||
def save_args(args):
|
||||
# save parameters
|
||||
args_dict = vars(args)
|
||||
with open(args.result_path+'params.json', 'w') as fp:
|
||||
def save_args(args,path=None):
|
||||
# 保存参数
|
||||
args_dict = vars(args)
|
||||
Path(path).mkdir(parents=True, exist_ok=True)
|
||||
with open(f"{path}/params.json", 'w') as fp:
|
||||
json.dump(args_dict, fp)
|
||||
print("Parameters saved!")
|
||||
def smooth(data, weight=0.9):
    '''Smooth a curve with an exponential moving average (similar to the
    smoothing slider in TensorBoard).

    Args:
        data (List): Input values.
        weight (Float): Smoothing weight between 0 and 1; the higher the
            value, the smoother the result. 0.9 is a common choice.

    Returns:
        smoothed (List): Smoothed values, same length as ``data``. An empty
        input yields an empty list.
    '''
    # Guard: data[0] below would raise IndexError on empty input (e.g. when
    # plotting a run with zero episodes).
    if len(data) == 0:
        return []
    last = data[0]  # seed the average with the first value
    smoothed = list()
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point  # blend previous average with the new point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed
|
||||
print("参数已保存!")
|
||||
|
||||
@@ -1,26 +1,9 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import gym
|
||||
import turtle
|
||||
import numpy as np
|
||||
|
||||
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
|
||||
|
||||
|
||||
def GridWorld(gridmap=None, is_slippery=False):
|
||||
if gridmap is None:
|
||||
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
|
||||
@@ -4,6 +4,7 @@
|
||||
# This file contains code for the racetrack environment that you will be using
|
||||
# as part of the second part of the CM50270: Reinforcement Learning coursework.
|
||||
|
||||
import imp
|
||||
import time
|
||||
import random
|
||||
import numpy as np
|
||||
@@ -11,7 +12,7 @@ import os
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patheffects as pe
|
||||
from IPython.display import clear_output
|
||||
|
||||
from gym.spaces import Discrete
|
||||
from matplotlib import colors
|
||||
|
||||
class RacetrackEnv(object) :
|
||||
@@ -61,7 +62,7 @@ class RacetrackEnv(object) :
|
||||
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
|
||||
self.initial_states.append((y, x))
|
||||
|
||||
|
||||
self.action_space = Discrete(9)
|
||||
self.is_reset = False
|
||||
|
||||
#print("Racetrack Environment File Loaded Successfully.")
|
||||
|
||||