Update PPO, add PER DQN

This commit is contained in:
johnjim0816
2022-11-14 21:35:28 +08:00
parent dc78698262
commit b8aec4c188
34 changed files with 1993 additions and 476 deletions

View File

@@ -22,16 +22,19 @@
Note: clicking an algorithm name jumps to its implementation under [codes](./codes/); for other versions, please browse the repository yourself.
| Algorithm | Reference | Notes |
| :-------------------------------------: | :----------------------------------------------------------: | :--: |
| [Policy Gradient](codes/PolicyGradient) | [Policy Gradient paper](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | |
| DQN-CNN | | TBD |
| [DoubleDQN](codes/DoubleDQN) | [Double DQN Paper](https://arxiv.org/abs/1509.06461) | |
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | TBD |
| Algorithm | Reference | Author | Notes |
| :-------------------------------------: | :----------------------------------------------------------: | :--------------------------------------------------: | :--: |
| [Policy Gradient](codes/PolicyGradient) | [Policy Gradient paper](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | [johnjim0816](https://github.com/johnjim0816) | |
| [Monte Carlo](codes/MonteCarlo) | | [johnjim0816](https://github.com/johnjim0816) | |
| [DQN](codes/DQN) | | [johnjim0816](https://github.com/johnjim0816) | |
| DQN-CNN | | | TBD |
| [PER_DQN](codes/PER_DQN) | [PER DQN Paper](https://arxiv.org/abs/1511.05952) | [wangzhongren](https://github.com/wangzhongren-code) | |
| [DoubleDQN](codes/DoubleDQN) | [Double DQN Paper](https://arxiv.org/abs/1509.06461) | [johnjim0816](https://github.com/johnjim0816) | |
| [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | [johnjim0816](https://github.com/johnjim0816) | |
| [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | | |
| [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | | |
| SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | | |
| DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | | TBD |
## 3. Algorithm Environments

View File

@@ -126,6 +126,46 @@
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{This can also be moved into the inner loop and done every $C$ steps as in the original paper, but that is less stable than every $C$ episodes.}
\clearpage
\section{PER\_DQN Algorithm}
\begin{algorithm}[H] % [H] fixes the float position
\floatname{algorithm}{{PER\_DQN Algorithm}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\begin{algorithmic}[1]
% \REQUIRE $n \geq 0 \vee x \neq 0$ % input
% \ENSURE $y = x^n$ % output
\STATE Initialize the policy network parameters $\theta$
\STATE Copy the parameters to the target network: $\hat{Q} \leftarrow Q$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1,M$}
\STATE Reset the environment and obtain the initial state $s_1$
\FOR {timestep $t = 1,T$}
\STATE Sample an action $a_t$ with the $\varepsilon$-greedy policy
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ for $a_t$
\STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$ and set its priority $p_t$ from the TD error
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\STATE {\bfseries Policy update:}
\STATE Sample a minibatch of transitions from $D$ according to the priorities, drawing transition $j$ with probability $P(j)=p_j^\alpha / \sum_i p_i^\alpha$
\STATE Compute the importance-sampling weight of each sample: $w_j=(N \cdot P(j))^{-\beta} / \max _i w_i$
\STATE Compute the TD error $\delta_j$ and update the priority $p_j$ accordingly
\STATE Compute the target $Q$ value $y_{j}$\footnotemark[2]
\STATE Weight the loss by the importance-sampling weight, $L(\theta)=w_j\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$, and take a stochastic gradient descent step on $\theta$\footnotemark[3]
\ENDFOR
\STATE Every $C$ episodes, copy the parameters: $\hat{Q}\leftarrow Q$\footnotemark[4]
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Prioritized Experience Replay}
\footnotetext[2]{$y_{j}= \begin{cases}r_{j} & \text {if } s_{j+1} \text { is terminal} \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(s_{j+1}, a^{\prime} ; \theta\right) & \text {otherwise}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{This can also be moved into the inner loop and done every $C$ steps as in the original paper, but that is less stable than every $C$ episodes.}
\clearpage
\section{Policy Gradient Algorithm}
\begin{algorithm}[H] % [H] fixes the float position
\floatname{algorithm}{{REINFORCE Algorithm (Monte-Carlo Policy Gradient)}\footnotemark[1]}
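
The sampling probability and importance-sampling weight used in the PER_DQN pseudocode above can be checked with a few lines of NumPy. This is an illustrative sketch (toy priorities, not repository code); the `alpha` and `beta` values match the defaults used later in `common/memories.py`:

```python
import numpy as np

# Illustrative priorities p_i for 5 stored transitions (toy numbers, not from the repo).
p = np.array([2.0, 1.0, 0.5, 0.1, 0.05])
alpha, beta = 0.6, 0.4                      # PER exponents, same defaults as memories.py

P = p**alpha / np.sum(p**alpha)             # P(j) = p_j^alpha / sum_i p_i^alpha
N = len(p)
w = (N * P) ** (-beta)                      # raw importance-sampling weights
w /= w.max()                                # normalize by max_i w_i as in the pseudocode
print(P.round(3))   # higher-priority transitions are sampled more often
print(w.round(3))   # ...and receive smaller weights to correct the sampling bias
```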

View File

@@ -1 +0,0 @@
{"algo_name": "First-Visit MC", "env_name": "Racetrack", "train_eps": 200, "test_eps": 20, "gamma": 0.9, "epsilon": 0.15, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/models/", "save_fig": true}

Binary file not shown.


Binary file not shown.


View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 14:26:44
LastEditor: John
LastEditTime: 2022-11-06 00:44:56
LastEditTime: 2022-11-08 23:35:18
Discription:
Environment:
'''
@@ -24,9 +24,6 @@ from common.launcher import Launcher
from MonteCarlo.agent import FisrtVisitMC
from MonteCarlo.config.config import GeneralConfigMC,AlgoConfigMC
curr_time = datetime.datetime.now().strftime(
    "%Y%m%d-%H%M%S")  # obtain current time
class Main(Launcher):
    def __init__(self) -> None:
        super().__init__()

View File

@@ -0,0 +1,25 @@
general_cfg:
  algo_name: PER_DQN
  device: cpu
  env_name: CartPole-v1
  eval_eps: 10
  eval_per_episode: 5
  load_checkpoint: true
  load_path: Train_CartPole-v1_PER_DQN_20221113-162804
  max_steps: 200
  mode: test
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  hidden_dim: 256
  lr: 0.0001
  target_update: 4

View File

@@ -0,0 +1,14 @@
2022-11-14 10:46:49 - r - INFO: - n_states: 4, n_actions: 2
2022-11-14 10:46:49 - r - INFO: - Start testing!
2022-11-14 10:46:49 - r - INFO: - Env: CartPole-v1, Algorithm: PER_DQN, Device: cpu
2022-11-14 10:46:49 - r - INFO: - Episode: 1/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 2/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 3/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 4/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 5/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 6/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 7/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 8/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 9/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 10/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Finish testing!

Binary file not shown.


View File

@@ -0,0 +1,11 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,200.0,200
3,200.0,200
4,200.0,200
5,200.0,200
6,200.0,200
7,200.0,200
8,200.0,200
9,200.0,200

View File

@@ -0,0 +1,25 @@
general_cfg:
  algo_name: PER_DQN
  device: cuda
  env_name: CartPole-v1
  eval_eps: 10
  eval_per_episode: 5
  load_checkpoint: false
  load_path: tasks
  max_steps: 200
  mode: train
  save_fig: true
  seed: 1
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  hidden_dim: 256
  lr: 0.0001
  target_update: 4

View File

@@ -0,0 +1,224 @@
2022-11-13 16:28:04 - r - INFO: - n_states: 4, n_actions: 2
2022-11-13 16:28:19 - r - INFO: - Start training!
2022-11-13 16:28:19 - r - INFO: - Env: CartPole-v1, Algorithm: PER_DQN, Device: cuda
2022-11-13 16:28:23 - r - INFO: - Episode: 1/200, Reward: 18.000, Step: 18
2022-11-13 16:28:24 - r - INFO: - Episode: 2/200, Reward: 35.000, Step: 35
2022-11-13 16:28:24 - r - INFO: - Episode: 3/200, Reward: 13.000, Step: 13
2022-11-13 16:28:24 - r - INFO: - Episode: 4/200, Reward: 20.000, Step: 20
2022-11-13 16:28:24 - r - INFO: - Episode: 5/200, Reward: 24.000, Step: 24
2022-11-13 16:28:24 - r - INFO: - Current episode 5 has the best eval reward: 9.100
2022-11-13 16:28:24 - r - INFO: - Episode: 6/200, Reward: 10.000, Step: 10
2022-11-13 16:28:24 - r - INFO: - Episode: 7/200, Reward: 20.000, Step: 20
2022-11-13 16:28:24 - r - INFO: - Episode: 8/200, Reward: 19.000, Step: 19
2022-11-13 16:28:25 - r - INFO: - Episode: 9/200, Reward: 30.000, Step: 30
2022-11-13 16:28:25 - r - INFO: - Episode: 10/200, Reward: 10.000, Step: 10
2022-11-13 16:28:25 - r - INFO: - Current episode 10 has the best eval reward: 9.200
2022-11-13 16:28:25 - r - INFO: - Episode: 11/200, Reward: 16.000, Step: 16
2022-11-13 16:28:25 - r - INFO: - Episode: 12/200, Reward: 16.000, Step: 16
2022-11-13 16:28:25 - r - INFO: - Episode: 13/200, Reward: 12.000, Step: 12
2022-11-13 16:28:25 - r - INFO: - Episode: 14/200, Reward: 28.000, Step: 28
2022-11-13 16:28:25 - r - INFO: - Episode: 15/200, Reward: 22.000, Step: 22
2022-11-13 16:28:25 - r - INFO: - Current episode 15 has the best eval reward: 9.300
2022-11-13 16:28:25 - r - INFO: - Episode: 16/200, Reward: 14.000, Step: 14
2022-11-13 16:28:25 - r - INFO: - Episode: 17/200, Reward: 9.000, Step: 9
2022-11-13 16:28:26 - r - INFO: - Episode: 18/200, Reward: 13.000, Step: 13
2022-11-13 16:28:26 - r - INFO: - Episode: 19/200, Reward: 19.000, Step: 19
2022-11-13 16:28:26 - r - INFO: - Episode: 20/200, Reward: 10.000, Step: 10
2022-11-13 16:28:26 - r - INFO: - Episode: 21/200, Reward: 10.000, Step: 10
2022-11-13 16:28:26 - r - INFO: - Episode: 22/200, Reward: 12.000, Step: 12
2022-11-13 16:28:26 - r - INFO: - Episode: 23/200, Reward: 9.000, Step: 9
2022-11-13 16:28:26 - r - INFO: - Episode: 24/200, Reward: 12.000, Step: 12
2022-11-13 16:28:26 - r - INFO: - Episode: 25/200, Reward: 11.000, Step: 11
2022-11-13 16:28:26 - r - INFO: - Current episode 25 has the best eval reward: 9.800
2022-11-13 16:28:26 - r - INFO: - Episode: 26/200, Reward: 11.000, Step: 11
2022-11-13 16:28:26 - r - INFO: - Episode: 27/200, Reward: 13.000, Step: 13
2022-11-13 16:28:26 - r - INFO: - Episode: 28/200, Reward: 11.000, Step: 11
2022-11-13 16:28:27 - r - INFO: - Episode: 29/200, Reward: 13.000, Step: 13
2022-11-13 16:28:27 - r - INFO: - Episode: 30/200, Reward: 20.000, Step: 20
2022-11-13 16:28:27 - r - INFO: - Current episode 30 has the best eval reward: 12.200
2022-11-13 16:28:27 - r - INFO: - Episode: 31/200, Reward: 16.000, Step: 16
2022-11-13 16:28:27 - r - INFO: - Episode: 32/200, Reward: 9.000, Step: 9
2022-11-13 16:28:27 - r - INFO: - Episode: 33/200, Reward: 16.000, Step: 16
2022-11-13 16:28:27 - r - INFO: - Episode: 34/200, Reward: 15.000, Step: 15
2022-11-13 16:28:27 - r - INFO: - Episode: 35/200, Reward: 12.000, Step: 12
2022-11-13 16:28:27 - r - INFO: - Current episode 35 has the best eval reward: 12.500
2022-11-13 16:28:27 - r - INFO: - Episode: 36/200, Reward: 12.000, Step: 12
2022-11-13 16:28:27 - r - INFO: - Episode: 37/200, Reward: 16.000, Step: 16
2022-11-13 16:28:28 - r - INFO: - Episode: 38/200, Reward: 13.000, Step: 13
2022-11-13 16:28:28 - r - INFO: - Episode: 39/200, Reward: 18.000, Step: 18
2022-11-13 16:28:28 - r - INFO: - Episode: 40/200, Reward: 18.000, Step: 18
2022-11-13 16:28:28 - r - INFO: - Current episode 40 has the best eval reward: 20.400
2022-11-13 16:28:28 - r - INFO: - Episode: 41/200, Reward: 48.000, Step: 48
2022-11-13 16:28:29 - r - INFO: - Episode: 42/200, Reward: 52.000, Step: 52
2022-11-13 16:28:29 - r - INFO: - Episode: 43/200, Reward: 33.000, Step: 33
2022-11-13 16:28:29 - r - INFO: - Episode: 44/200, Reward: 15.000, Step: 15
2022-11-13 16:28:29 - r - INFO: - Episode: 45/200, Reward: 18.000, Step: 18
2022-11-13 16:28:29 - r - INFO: - Episode: 46/200, Reward: 22.000, Step: 22
2022-11-13 16:28:29 - r - INFO: - Episode: 47/200, Reward: 19.000, Step: 19
2022-11-13 16:28:30 - r - INFO: - Episode: 48/200, Reward: 19.000, Step: 19
2022-11-13 16:28:30 - r - INFO: - Episode: 49/200, Reward: 11.000, Step: 11
2022-11-13 16:28:30 - r - INFO: - Episode: 50/200, Reward: 9.000, Step: 9
2022-11-13 16:28:30 - r - INFO: - Episode: 51/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 52/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 53/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 54/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 55/200, Reward: 9.000, Step: 9
2022-11-13 16:28:30 - r - INFO: - Episode: 56/200, Reward: 17.000, Step: 17
2022-11-13 16:28:31 - r - INFO: - Episode: 57/200, Reward: 75.000, Step: 75
2022-11-13 16:28:31 - r - INFO: - Episode: 58/200, Reward: 28.000, Step: 28
2022-11-13 16:28:31 - r - INFO: - Episode: 59/200, Reward: 30.000, Step: 30
2022-11-13 16:28:32 - r - INFO: - Episode: 60/200, Reward: 54.000, Step: 54
2022-11-13 16:28:32 - r - INFO: - Current episode 60 has the best eval reward: 34.600
2022-11-13 16:28:32 - r - INFO: - Episode: 61/200, Reward: 22.000, Step: 22
2022-11-13 16:28:32 - r - INFO: - Episode: 62/200, Reward: 28.000, Step: 28
2022-11-13 16:28:32 - r - INFO: - Episode: 63/200, Reward: 26.000, Step: 26
2022-11-13 16:28:33 - r - INFO: - Episode: 64/200, Reward: 32.000, Step: 32
2022-11-13 16:28:33 - r - INFO: - Episode: 65/200, Reward: 30.000, Step: 30
2022-11-13 16:28:33 - r - INFO: - Episode: 66/200, Reward: 29.000, Step: 29
2022-11-13 16:28:34 - r - INFO: - Episode: 67/200, Reward: 28.000, Step: 28
2022-11-13 16:28:34 - r - INFO: - Episode: 68/200, Reward: 38.000, Step: 38
2022-11-13 16:28:34 - r - INFO: - Episode: 69/200, Reward: 28.000, Step: 28
2022-11-13 16:28:34 - r - INFO: - Episode: 70/200, Reward: 22.000, Step: 22
2022-11-13 16:28:34 - r - INFO: - Current episode 70 has the best eval reward: 36.700
2022-11-13 16:28:35 - r - INFO: - Episode: 71/200, Reward: 40.000, Step: 40
2022-11-13 16:28:35 - r - INFO: - Episode: 72/200, Reward: 27.000, Step: 27
2022-11-13 16:28:35 - r - INFO: - Episode: 73/200, Reward: 24.000, Step: 24
2022-11-13 16:28:35 - r - INFO: - Episode: 74/200, Reward: 47.000, Step: 47
2022-11-13 16:28:36 - r - INFO: - Episode: 75/200, Reward: 127.000, Step: 127
2022-11-13 16:28:37 - r - INFO: - Episode: 76/200, Reward: 48.000, Step: 48
2022-11-13 16:28:37 - r - INFO: - Episode: 77/200, Reward: 27.000, Step: 27
2022-11-13 16:28:37 - r - INFO: - Episode: 78/200, Reward: 65.000, Step: 65
2022-11-13 16:28:38 - r - INFO: - Episode: 79/200, Reward: 75.000, Step: 75
2022-11-13 16:28:38 - r - INFO: - Episode: 80/200, Reward: 47.000, Step: 47
2022-11-13 16:28:38 - r - INFO: - Current episode 80 has the best eval reward: 37.200
2022-11-13 16:28:39 - r - INFO: - Episode: 81/200, Reward: 34.000, Step: 34
2022-11-13 16:28:39 - r - INFO: - Episode: 82/200, Reward: 38.000, Step: 38
2022-11-13 16:28:39 - r - INFO: - Episode: 83/200, Reward: 24.000, Step: 24
2022-11-13 16:28:39 - r - INFO: - Episode: 84/200, Reward: 47.000, Step: 47
2022-11-13 16:28:40 - r - INFO: - Episode: 85/200, Reward: 35.000, Step: 35
2022-11-13 16:28:40 - r - INFO: - Current episode 85 has the best eval reward: 66.900
2022-11-13 16:28:41 - r - INFO: - Episode: 86/200, Reward: 103.000, Step: 103
2022-11-13 16:28:41 - r - INFO: - Episode: 87/200, Reward: 64.000, Step: 64
2022-11-13 16:28:42 - r - INFO: - Episode: 88/200, Reward: 59.000, Step: 59
2022-11-13 16:28:43 - r - INFO: - Episode: 89/200, Reward: 200.000, Step: 200
2022-11-13 16:28:44 - r - INFO: - Episode: 90/200, Reward: 200.000, Step: 200
2022-11-13 16:28:46 - r - INFO: - Current episode 90 has the best eval reward: 200.000
2022-11-13 16:28:47 - r - INFO: - Episode: 91/200, Reward: 200.000, Step: 200
2022-11-13 16:28:48 - r - INFO: - Episode: 92/200, Reward: 200.000, Step: 200
2022-11-13 16:28:50 - r - INFO: - Episode: 93/200, Reward: 200.000, Step: 200
2022-11-13 16:28:51 - r - INFO: - Episode: 94/200, Reward: 200.000, Step: 200
2022-11-13 16:28:52 - r - INFO: - Episode: 95/200, Reward: 200.000, Step: 200
2022-11-13 16:28:54 - r - INFO: - Current episode 95 has the best eval reward: 200.000
2022-11-13 16:28:55 - r - INFO: - Episode: 96/200, Reward: 200.000, Step: 200
2022-11-13 16:28:56 - r - INFO: - Episode: 97/200, Reward: 200.000, Step: 200
2022-11-13 16:28:58 - r - INFO: - Episode: 98/200, Reward: 200.000, Step: 200
2022-11-13 16:28:59 - r - INFO: - Episode: 99/200, Reward: 200.000, Step: 200
2022-11-13 16:29:00 - r - INFO: - Episode: 100/200, Reward: 200.000, Step: 200
2022-11-13 16:29:02 - r - INFO: - Current episode 100 has the best eval reward: 200.000
2022-11-13 16:29:04 - r - INFO: - Episode: 101/200, Reward: 200.000, Step: 200
2022-11-13 16:29:05 - r - INFO: - Episode: 102/200, Reward: 200.000, Step: 200
2022-11-13 16:29:06 - r - INFO: - Episode: 103/200, Reward: 200.000, Step: 200
2022-11-13 16:29:07 - r - INFO: - Episode: 104/200, Reward: 200.000, Step: 200
2022-11-13 16:29:09 - r - INFO: - Episode: 105/200, Reward: 200.000, Step: 200
2022-11-13 16:29:10 - r - INFO: - Current episode 105 has the best eval reward: 200.000
2022-11-13 16:29:11 - r - INFO: - Episode: 106/200, Reward: 200.000, Step: 200
2022-11-13 16:29:13 - r - INFO: - Episode: 107/200, Reward: 200.000, Step: 200
2022-11-13 16:29:14 - r - INFO: - Episode: 108/200, Reward: 200.000, Step: 200
2022-11-13 16:29:16 - r - INFO: - Episode: 109/200, Reward: 200.000, Step: 200
2022-11-13 16:29:17 - r - INFO: - Episode: 110/200, Reward: 200.000, Step: 200
2022-11-13 16:29:20 - r - INFO: - Episode: 111/200, Reward: 200.000, Step: 200
2022-11-13 16:29:21 - r - INFO: - Episode: 112/200, Reward: 200.000, Step: 200
2022-11-13 16:29:22 - r - INFO: - Episode: 113/200, Reward: 200.000, Step: 200
2022-11-13 16:29:23 - r - INFO: - Episode: 114/200, Reward: 200.000, Step: 200
2022-11-13 16:29:25 - r - INFO: - Episode: 115/200, Reward: 200.000, Step: 200
2022-11-13 16:29:26 - r - INFO: - Current episode 115 has the best eval reward: 200.000
2022-11-13 16:29:27 - r - INFO: - Episode: 116/200, Reward: 200.000, Step: 200
2022-11-13 16:29:29 - r - INFO: - Episode: 117/200, Reward: 200.000, Step: 200
2022-11-13 16:29:30 - r - INFO: - Episode: 118/200, Reward: 200.000, Step: 200
2022-11-13 16:29:31 - r - INFO: - Episode: 119/200, Reward: 200.000, Step: 200
2022-11-13 16:29:33 - r - INFO: - Episode: 120/200, Reward: 200.000, Step: 200
2022-11-13 16:29:34 - r - INFO: - Current episode 120 has the best eval reward: 200.000
2022-11-13 16:29:35 - r - INFO: - Episode: 121/200, Reward: 200.000, Step: 200
2022-11-13 16:29:37 - r - INFO: - Episode: 122/200, Reward: 200.000, Step: 200
2022-11-13 16:29:38 - r - INFO: - Episode: 123/200, Reward: 200.000, Step: 200
2022-11-13 16:29:39 - r - INFO: - Episode: 124/200, Reward: 200.000, Step: 200
2022-11-13 16:29:41 - r - INFO: - Episode: 125/200, Reward: 200.000, Step: 200
2022-11-13 16:29:43 - r - INFO: - Episode: 126/200, Reward: 200.000, Step: 200
2022-11-13 16:29:45 - r - INFO: - Episode: 127/200, Reward: 200.000, Step: 200
2022-11-13 16:29:46 - r - INFO: - Episode: 128/200, Reward: 200.000, Step: 200
2022-11-13 16:29:47 - r - INFO: - Episode: 129/200, Reward: 200.000, Step: 200
2022-11-13 16:29:49 - r - INFO: - Episode: 130/200, Reward: 200.000, Step: 200
2022-11-13 16:29:51 - r - INFO: - Episode: 131/200, Reward: 200.000, Step: 200
2022-11-13 16:29:53 - r - INFO: - Episode: 132/200, Reward: 200.000, Step: 200
2022-11-13 16:29:54 - r - INFO: - Episode: 133/200, Reward: 200.000, Step: 200
2022-11-13 16:29:55 - r - INFO: - Episode: 134/200, Reward: 200.000, Step: 200
2022-11-13 16:29:57 - r - INFO: - Episode: 135/200, Reward: 200.000, Step: 200
2022-11-13 16:29:59 - r - INFO: - Episode: 136/200, Reward: 200.000, Step: 200
2022-11-13 16:30:01 - r - INFO: - Episode: 137/200, Reward: 185.000, Step: 185
2022-11-13 16:30:02 - r - INFO: - Episode: 138/200, Reward: 193.000, Step: 193
2022-11-13 16:30:03 - r - INFO: - Episode: 139/200, Reward: 192.000, Step: 192
2022-11-13 16:30:04 - r - INFO: - Episode: 140/200, Reward: 200.000, Step: 200
2022-11-13 16:30:07 - r - INFO: - Episode: 141/200, Reward: 200.000, Step: 200
2022-11-13 16:30:08 - r - INFO: - Episode: 142/200, Reward: 200.000, Step: 200
2022-11-13 16:30:10 - r - INFO: - Episode: 143/200, Reward: 200.000, Step: 200
2022-11-13 16:30:11 - r - INFO: - Episode: 144/200, Reward: 191.000, Step: 191
2022-11-13 16:30:12 - r - INFO: - Episode: 145/200, Reward: 200.000, Step: 200
2022-11-13 16:30:15 - r - INFO: - Episode: 146/200, Reward: 184.000, Step: 184
2022-11-13 16:30:17 - r - INFO: - Episode: 147/200, Reward: 198.000, Step: 198
2022-11-13 16:30:18 - r - INFO: - Episode: 148/200, Reward: 200.000, Step: 200
2022-11-13 16:30:19 - r - INFO: - Episode: 149/200, Reward: 200.000, Step: 200
2022-11-13 16:30:21 - r - INFO: - Episode: 150/200, Reward: 192.000, Step: 192
2022-11-13 16:30:23 - r - INFO: - Episode: 151/200, Reward: 186.000, Step: 186
2022-11-13 16:30:25 - r - INFO: - Episode: 152/200, Reward: 200.000, Step: 200
2022-11-13 16:30:26 - r - INFO: - Episode: 153/200, Reward: 194.000, Step: 194
2022-11-13 16:30:27 - r - INFO: - Episode: 154/200, Reward: 199.000, Step: 199
2022-11-13 16:30:29 - r - INFO: - Episode: 155/200, Reward: 183.000, Step: 183
2022-11-13 16:30:32 - r - INFO: - Episode: 156/200, Reward: 173.000, Step: 173
2022-11-13 16:30:33 - r - INFO: - Episode: 157/200, Reward: 197.000, Step: 197
2022-11-13 16:30:34 - r - INFO: - Episode: 158/200, Reward: 200.000, Step: 200
2022-11-13 16:30:36 - r - INFO: - Episode: 159/200, Reward: 200.000, Step: 200
2022-11-13 16:30:37 - r - INFO: - Episode: 160/200, Reward: 196.000, Step: 196
2022-11-13 16:30:40 - r - INFO: - Episode: 161/200, Reward: 200.000, Step: 200
2022-11-13 16:30:42 - r - INFO: - Episode: 162/200, Reward: 200.000, Step: 200
2022-11-13 16:30:43 - r - INFO: - Episode: 163/200, Reward: 194.000, Step: 194
2022-11-13 16:30:44 - r - INFO: - Episode: 164/200, Reward: 185.000, Step: 185
2022-11-13 16:30:45 - r - INFO: - Episode: 165/200, Reward: 173.000, Step: 173
2022-11-13 16:30:48 - r - INFO: - Episode: 166/200, Reward: 192.000, Step: 192
2022-11-13 16:30:49 - r - INFO: - Episode: 167/200, Reward: 164.000, Step: 164
2022-11-13 16:30:50 - r - INFO: - Episode: 168/200, Reward: 188.000, Step: 188
2022-11-13 16:30:52 - r - INFO: - Episode: 169/200, Reward: 189.000, Step: 189
2022-11-13 16:30:53 - r - INFO: - Episode: 170/200, Reward: 197.000, Step: 197
2022-11-13 16:30:55 - r - INFO: - Episode: 171/200, Reward: 187.000, Step: 187
2022-11-13 16:30:57 - r - INFO: - Episode: 172/200, Reward: 200.000, Step: 200
2022-11-13 16:30:58 - r - INFO: - Episode: 173/200, Reward: 195.000, Step: 195
2022-11-13 16:30:59 - r - INFO: - Episode: 174/200, Reward: 200.000, Step: 200
2022-11-13 16:31:01 - r - INFO: - Episode: 175/200, Reward: 195.000, Step: 195
2022-11-13 16:31:03 - r - INFO: - Episode: 176/200, Reward: 200.000, Step: 200
2022-11-13 16:31:05 - r - INFO: - Episode: 177/200, Reward: 200.000, Step: 200
2022-11-13 16:31:06 - r - INFO: - Episode: 178/200, Reward: 200.000, Step: 200
2022-11-13 16:31:07 - r - INFO: - Episode: 179/200, Reward: 200.000, Step: 200
2022-11-13 16:31:09 - r - INFO: - Episode: 180/200, Reward: 200.000, Step: 200
2022-11-13 16:31:11 - r - INFO: - Episode: 181/200, Reward: 200.000, Step: 200
2022-11-13 16:31:13 - r - INFO: - Episode: 182/200, Reward: 200.000, Step: 200
2022-11-13 16:31:14 - r - INFO: - Episode: 183/200, Reward: 200.000, Step: 200
2022-11-13 16:31:15 - r - INFO: - Episode: 184/200, Reward: 200.000, Step: 200
2022-11-13 16:31:17 - r - INFO: - Episode: 185/200, Reward: 173.000, Step: 173
2022-11-13 16:31:19 - r - INFO: - Episode: 186/200, Reward: 200.000, Step: 200
2022-11-13 16:31:21 - r - INFO: - Episode: 187/200, Reward: 200.000, Step: 200
2022-11-13 16:31:22 - r - INFO: - Episode: 188/200, Reward: 200.000, Step: 200
2022-11-13 16:31:23 - r - INFO: - Episode: 189/200, Reward: 200.000, Step: 200
2022-11-13 16:31:24 - r - INFO: - Episode: 190/200, Reward: 200.000, Step: 200
2022-11-13 16:31:26 - r - INFO: - Current episode 190 has the best eval reward: 200.000
2022-11-13 16:31:27 - r - INFO: - Episode: 191/200, Reward: 200.000, Step: 200
2022-11-13 16:31:29 - r - INFO: - Episode: 192/200, Reward: 200.000, Step: 200
2022-11-13 16:31:30 - r - INFO: - Episode: 193/200, Reward: 200.000, Step: 200
2022-11-13 16:31:31 - r - INFO: - Episode: 194/200, Reward: 200.000, Step: 200
2022-11-13 16:31:33 - r - INFO: - Episode: 195/200, Reward: 200.000, Step: 200
2022-11-13 16:31:34 - r - INFO: - Current episode 195 has the best eval reward: 200.000
2022-11-13 16:31:35 - r - INFO: - Episode: 196/200, Reward: 200.000, Step: 200
2022-11-13 16:31:37 - r - INFO: - Episode: 197/200, Reward: 200.000, Step: 200
2022-11-13 16:31:38 - r - INFO: - Episode: 198/200, Reward: 200.000, Step: 200
2022-11-13 16:31:39 - r - INFO: - Episode: 199/200, Reward: 200.000, Step: 200
2022-11-13 16:31:40 - r - INFO: - Episode: 200/200, Reward: 200.000, Step: 200
2022-11-13 16:31:42 - r - INFO: - Current episode 200 has the best eval reward: 200.000
2022-11-13 16:31:42 - r - INFO: - Finish training!

Binary file not shown.


View File

@@ -0,0 +1,201 @@
episodes,rewards,steps
0,18.0,18
1,35.0,35
2,13.0,13
3,20.0,20
4,24.0,24
5,10.0,10
6,20.0,20
7,19.0,19
8,30.0,30
9,10.0,10
10,16.0,16
11,16.0,16
12,12.0,12
13,28.0,28
14,22.0,22
15,14.0,14
16,9.0,9
17,13.0,13
18,19.0,19
19,10.0,10
20,10.0,10
21,12.0,12
22,9.0,9
23,12.0,12
24,11.0,11
25,11.0,11
26,13.0,13
27,11.0,11
28,13.0,13
29,20.0,20
30,16.0,16
31,9.0,9
32,16.0,16
33,15.0,15
34,12.0,12
35,12.0,12
36,16.0,16
37,13.0,13
38,18.0,18
39,18.0,18
40,48.0,48
41,52.0,52
42,33.0,33
43,15.0,15
44,18.0,18
45,22.0,22
46,19.0,19
47,19.0,19
48,11.0,11
49,9.0,9
50,10.0,10
51,10.0,10
52,10.0,10
53,10.0,10
54,9.0,9
55,17.0,17
56,75.0,75
57,28.0,28
58,30.0,30
59,54.0,54
60,22.0,22
61,28.0,28
62,26.0,26
63,32.0,32
64,30.0,30
65,29.0,29
66,28.0,28
67,38.0,38
68,28.0,28
69,22.0,22
70,40.0,40
71,27.0,27
72,24.0,24
73,47.0,47
74,127.0,127
75,48.0,48
76,27.0,27
77,65.0,65
78,75.0,75
79,47.0,47
80,34.0,34
81,38.0,38
82,24.0,24
83,47.0,47
84,35.0,35
85,103.0,103
86,64.0,64
87,59.0,59
88,200.0,200
89,200.0,200
90,200.0,200
91,200.0,200
92,200.0,200
93,200.0,200
94,200.0,200
95,200.0,200
96,200.0,200
97,200.0,200
98,200.0,200
99,200.0,200
100,200.0,200
101,200.0,200
102,200.0,200
103,200.0,200
104,200.0,200
105,200.0,200
106,200.0,200
107,200.0,200
108,200.0,200
109,200.0,200
110,200.0,200
111,200.0,200
112,200.0,200
113,200.0,200
114,200.0,200
115,200.0,200
116,200.0,200
117,200.0,200
118,200.0,200
119,200.0,200
120,200.0,200
121,200.0,200
122,200.0,200
123,200.0,200
124,200.0,200
125,200.0,200
126,200.0,200
127,200.0,200
128,200.0,200
129,200.0,200
130,200.0,200
131,200.0,200
132,200.0,200
133,200.0,200
134,200.0,200
135,200.0,200
136,185.0,185
137,193.0,193
138,192.0,192
139,200.0,200
140,200.0,200
141,200.0,200
142,200.0,200
143,191.0,191
144,200.0,200
145,184.0,184
146,198.0,198
147,200.0,200
148,200.0,200
149,192.0,192
150,186.0,186
151,200.0,200
152,194.0,194
153,199.0,199
154,183.0,183
155,173.0,173
156,197.0,197
157,200.0,200
158,200.0,200
159,196.0,196
160,200.0,200
161,200.0,200
162,194.0,194
163,185.0,185
164,173.0,173
165,192.0,192
166,164.0,164
167,188.0,188
168,189.0,189
169,197.0,197
170,187.0,187
171,200.0,200
172,195.0,195
173,200.0,200
174,195.0,195
175,200.0,200
176,200.0,200
177,200.0,200
178,200.0,200
179,200.0,200
180,200.0,200
181,200.0,200
182,200.0,200
183,200.0,200
184,173.0,173
185,200.0,200
186,200.0,200
187,200.0,200
188,200.0,200
189,200.0,200
190,200.0,200
191,200.0,200
192,200.0,200
193,200.0,200
194,200.0,200
195,200.0,200
196,200.0,200
197,200.0,200
198,200.0,200
199,200.0,200

View File

@@ -0,0 +1,22 @@
general_cfg:
  algo_name: PER_DQN
  device: cpu
  env_name: CartPole-v1
  mode: test
  load_checkpoint: true
  load_path: Train_CartPole-v1_PER_DQN_20221113-162804
  max_steps: 200
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  lr: 0.0001
  target_update: 4

View File

@@ -0,0 +1,22 @@
general_cfg:
  algo_name: PER_DQN
  device: cuda
  env_name: CartPole-v1
  mode: train
  load_checkpoint: false
  load_path: Train_CartPole-v1_PER_DQN_20221026-054757
  max_steps: 200
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  lr: 0.0001
  target_update: 4

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 00:37:33
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:19:08
Discription: default parameters of DQN
'''
from common.config import GeneralConfig,AlgoConfig
class GeneralConfigDQN(GeneralConfig):
    def __init__(self) -> None:
        self.env_name = "CartPole-v1" # name of environment
        self.algo_name = "PER_DQN" # name of algorithm
        self.mode = "train" # train or test
        self.seed = 1 # random seed
        self.device = "cuda" # device to use
        self.train_eps = 200 # number of episodes for training
        self.test_eps = 10 # number of episodes for testing
        self.max_steps = 200 # max steps for each episode
        self.load_checkpoint = False
        self.load_path = "tasks" # path to load model
        self.show_fig = False # show figure or not
        self.save_fig = True # save figure or not
class AlgoConfigDQN(AlgoConfig):
    def __init__(self) -> None:
        # setting epsilon_start=epsilon_end gives a fixed epsilon=epsilon_end
        self.epsilon_start = 0.95 # epsilon start value
        self.epsilon_end = 0.01 # epsilon end value
        self.epsilon_decay = 500 # epsilon decay rate
        self.hidden_dim = 256 # hidden_dim for MLP
        self.gamma = 0.95 # discount factor
        self.lr = 0.0001 # learning rate
        self.buffer_size = 100000 # size of replay buffer
        self.batch_size = 64 # batch size
        self.target_update = 4 # target network update frequency
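
As a quick sanity check, the attribute names defined by these default-config classes mirror the keys under `general_cfg:` and `algo_cfg:` in the YAML files added above. A sketch for inspecting them, assuming it is run from the `codes/PER_DQN` directory so that `config.config` resolves the same way it does in `main.py`:

```python
# Sketch only: dump the defaults defined above and compare with the YAML configs.
from config.config import GeneralConfigDQN, AlgoConfigDQN  # same import path as main.py

print(vars(GeneralConfigDQN()))  # e.g. {'env_name': 'CartPole-v1', 'algo_name': 'PER_DQN', ...}
print(vars(AlgoConfigDQN()))     # e.g. {'epsilon_start': 0.95, ..., 'target_update': 4}
```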

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: DingLi
Email: wangzhongren@sjtu.edu.cn
Date: 2022-10-31 22:54:00
LastEditor: DingLi
LastEditTime: 2022-11-14 10:43:18
Discription: CartPole-v1
'''
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-10-26 07:50:24
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import numpy as np
class PER_DQN:
    def __init__(self,model,memory,cfg):
        self.n_actions = cfg.n_actions
        self.device = torch.device(cfg.device)
        self.gamma = cfg.gamma
        ## e-greedy parameters
        self.sample_count = 0 # sample count for epsilon decay
        self.epsilon = cfg.epsilon_start
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.batch_size = cfg.batch_size
        self.target_update = cfg.target_update # sync target net every C sampled steps
        self.policy_net = model.to(self.device)
        self.target_net = copy.deepcopy(model).to(self.device) # independent copy so the target net is actually frozen between syncs (reusing the same module would alias both nets)
        ## copy parameters from policy net to target net
        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):
            target_param.data.copy_(param.data)
        # self.target_net.load_state_dict(self.policy_net.state_dict()) # or use this to copy parameters
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
        self.memory = memory
        self.update_flag = False
    def sample_action(self, state):
        ''' sample action with e-greedy policy
        '''
        self.sample_count += 1
        # epsilon must decay (linearly, exponentially, etc.) to balance exploration and exploitation
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # choose the action with the maximum q value
        else:
            action = random.randrange(self.n_actions)
        return action
    # @torch.no_grad()
    # def sample_action(self, state):
    #     ''' sample action with e-greedy policy
    #     '''
    #     self.sample_count += 1
    #     # epsilon must decay (linearly, exponentially, etc.) to balance exploration and exploitation
    #     self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
    #         math.exp(-1. * self.sample_count / self.epsilon_decay)
    #     if random.random() > self.epsilon:
    #         state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
    #         q_values = self.policy_net(state)
    #         action = q_values.max(1)[1].item() # choose the action with the maximum q value
    #     else:
    #         action = random.randrange(self.n_actions)
    #     return action
    def predict_action(self,state):
        ''' predict action
        '''
        with torch.no_grad():
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            q_values = self.policy_net(state)
            action = q_values.max(1)[1].item() # choose the action with the maximum q value
        return action
    def update(self):
        if len(self.memory) < self.batch_size: # do not update until the buffer holds at least one batch
            # print ("self.batch_size = ", self.batch_size)
            return
        else:
            if not self.update_flag:
                print("Begin to update!")
                self.update_flag = True
        # sample a batch of transitions from the replay buffer
        (state_batch, action_batch, reward_batch, next_state_batch, done_batch), idxs_batch, is_weights_batch = self.memory.sample(
            self.batch_size)
        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) # shape(batchsize,1)
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1) # shape(batchsize,1)
        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float) # shape(batchsize,n_states)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1) # shape(batchsize,1)
        q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch) # shape(batchsize,1),requires_grad=True
        next_max_q_value_batch = self.target_net(next_state_batch).max(1)[0].detach().unsqueeze(1)
        expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch * (1-done_batch)
        # importance-sampling weights, reshaped to (batchsize,1) so each sample is weighted individually
        # (and moved to the configured device instead of hard-coded cuda)
        is_weights_batch = torch.from_numpy(np.asarray(is_weights_batch)).to(self.device).unsqueeze(1)
        loss = torch.mean(is_weights_batch * torch.pow(q_value_batch - expected_q_value_batch, 2)) # weighted MSE loss
        # loss = nn.MSELoss()(q_value_batch, expected_q_value_batch) # unweighted alternative
        abs_errors = np.sum(np.abs(q_value_batch.cpu().detach().numpy() - expected_q_value_batch.cpu().detach().numpy()), axis=1)
        self.memory.batch_update(idxs_batch, abs_errors) # refresh the priorities with the new TD errors
        # backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        # clip to avoid gradient explosion
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        if self.sample_count % self.target_update == 0: # target net update, target_update is "C" in the pseudocode
            self.target_net.load_state_dict(self.policy_net.state_dict())
    def save_model(self, fpath):
        from pathlib import Path
        # create path
        Path(fpath).mkdir(parents=True, exist_ok=True)
        torch.save(self.target_net.state_dict(), f"{fpath}/checkpoint.pt")
    def load_model(self, fpath):
        checkpoint = torch.load(f"{fpath}/checkpoint.pt", map_location=self.device)
        self.target_net.load_state_dict(checkpoint)
        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            param.data.copy_(target_param.data)
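
The epsilon schedule in `sample_action` decays exponentially from `epsilon_start` to `epsilon_end` with time constant `epsilon_decay` (measured in sampled steps). A small standalone sketch of the values produced by the defaults shipped in this commit (0.95, 0.01, 500):

```python
import math

# defaults from the PER_DQN configs in this commit
epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500

def epsilon_at(sample_count: int) -> float:
    # same formula as PER_DQN.sample_action
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * sample_count / epsilon_decay)

for c in (0, 500, 1000, 2000, 5000):
    print(c, round(epsilon_at(c), 3))  # 0.95, 0.356, 0.137, 0.027, 0.01
```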

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: DingLi
Email: wangzhongren@sjtu.edu.cn
Date: 2022-10-31 22:54:00
LastEditor: DingLi
LastEditTime: 2022-11-14 10:45:11
Discription: CartPole-v1
'''
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-12 11:09:54
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:29:25
Discription: CartPole-v1,Acrobot-v1
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add to system path
import gym
import torch
from common.utils import all_seed,merge_class_attrs
from common.models import MLP
from common.memories import ReplayBuffer, ReplayTree
from common.launcher import Launcher
from envs.register import register_env
from per_dqn import PER_DQN
from config.config import GeneralConfigDQN,AlgoConfigDQN
class Main(Launcher):
    def __init__(self) -> None:
        super().__init__()
        self.cfgs['general_cfg'] = merge_class_attrs(self.cfgs['general_cfg'],GeneralConfigDQN())
        self.cfgs['algo_cfg'] = merge_class_attrs(self.cfgs['algo_cfg'],AlgoConfigDQN())
    def env_agent_config(self,cfg,logger):
        ''' create env and agent
        '''
        register_env(cfg.env_name)
        env = gym.make(cfg.env_name,new_step_api=True) # create env
        all_seed(env,seed=cfg.seed) # set random seed
        try: # state dimension
            n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
        except AttributeError:
            n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
        n_actions = env.action_space.n # action dimension
        logger.info(f"n_states: {n_states}, n_actions: {n_actions}") # print info
        # update cfg parameters
        setattr(cfg, 'n_states', n_states)
        setattr(cfg, 'n_actions', n_actions)
        # cfg.update({"n_states":n_states,"n_actions":n_actions}) # update cfg parameters
        model = MLP(n_states,n_actions,hidden_dim=cfg.hidden_dim)
        memory = ReplayTree(cfg.buffer_size) # prioritized replay buffer backed by a SumTree
        agent = PER_DQN(model,memory,cfg) # create agent
        return env, agent
    def train_one_episode(self,env, agent, cfg):
        ''' train one episode
        '''
        ep_reward = 0 # reward per episode
        ep_step = 0
        state = env.reset() # reset and obtain initial state
        for _ in range(cfg.max_steps):
            ep_step += 1
            action = agent.sample_action(state) # sample action
            next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions under new_step_api of OpenAI Gym
            # initial priority: absolute TD error of the new transition
            policy_val = agent.policy_net(torch.tensor(state, device = cfg.device))[action]
            target_val = agent.target_net(torch.tensor(next_state, device = cfg.device))
            if terminated:
                error = abs(policy_val - reward)
            else:
                error = abs(policy_val - reward - cfg.gamma * torch.max(target_val))
            agent.memory.push(error.cpu().detach().numpy(), (state, action, reward,
                next_state, terminated))  # save transitions
            state = next_state # update next state for env
            agent.update() # update agent
            ep_reward += reward
            if terminated:
                break
        return agent, ep_reward, ep_step
    def test_one_episode(self, env, agent, cfg):
        ep_reward = 0 # reward per episode
        ep_step = 0
        state = env.reset() # reset and obtain initial state
        for _ in range(cfg.max_steps):
            ep_step += 1
            action = agent.predict_action(state) # predict action
            next_state, reward, terminated, _, _ = env.step(action)
            state = next_state
            ep_reward += reward
            if terminated:
                break
        return agent, ep_reward, ep_step
if __name__ == "__main__":
    main = Main()
    main.run()

View File

@@ -36,11 +36,11 @@ class Launcher:
        ep_reward = 0
        ep_step = 0
        return agent,ep_reward,ep_step
    def test_one_episode(self,env, agent, cfg):
    def test_one_episode(self, env, agent, cfg):
        ep_reward = 0
        ep_step = 0
        return agent,ep_reward,ep_step
    def evaluate(self,env, agent, cfg):
    def evaluate(self, env, agent, cfg):
        sum_eval_reward = 0
        for _ in range(cfg.eval_eps):
            _,eval_ep_reward,_ = self.test_one_episode(env, agent, cfg)

View File

@@ -10,6 +10,7 @@ LastEditTime: 2022-08-28 23:44:06
@Environment: python 3.7.7
'''
import random
import numpy as np
from collections import deque
class ReplayBuffer:
    def __init__(self, capacity):
@@ -71,4 +72,136 @@ class PGReplay(ReplayBufferQue):
        ''' sample all the transitions
        '''
        batch = list(self.buffer)
        return zip(*batch)
        return zip(*batch)
class SumTree:
    '''SumTree for the PER (Prioritized Experience Replay) DQN.
    This SumTree code is a modified version and the original code is from:
    https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
    '''
    def __init__(self, capacity: int):
        self.capacity = capacity
        self.data_pointer = 0
        self.n_entries = 0
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype = object)
    def update(self, tree_idx, p):
        '''Update the priority of a leaf and propagate the change up the tree
        '''
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        while tree_idx != 0:
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change
    def add(self, p, data):
        '''Add new data to the SumTree with priority p
        '''
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        # print ("tree_idx=", tree_idx)
        # print ("nonzero = ", np.count_nonzero(self.tree))
        self.update(tree_idx, p)
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0
        if self.n_entries < self.capacity:
            self.n_entries += 1
    def get_leaf(self, v):
        '''Sample a leaf whose cumulative priority interval contains v
        '''
        parent_idx = 0
        while True:
            cl_idx = 2 * parent_idx + 1
            cr_idx = cl_idx + 1
            if cl_idx >= len(self.tree):
                leaf_idx = parent_idx
                break
            else:
                if v <= self.tree[cl_idx]:
                    parent_idx = cl_idx
                else:
                    v -= self.tree[cl_idx]
                    parent_idx = cr_idx
        data_idx = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]
    def total(self):
        return int(self.tree[0])
class ReplayTree:
    '''ReplayTree for the PER (Prioritized Experience Replay) DQN.
    '''
    def __init__(self, capacity):
        self.capacity = capacity # the capacity of the replay memory
        self.tree = SumTree(capacity)
        ## hyperparameters for calculating the importance sampling weight
        self.beta_increment_per_sampling = 0.001
        self.alpha = 0.6
        self.beta = 0.4
        self.epsilon = 0.01
        self.abs_err_upper = 1.
    def __len__(self):
        ''' return the total priority mass stored in the tree
        '''
        return self.tree.total()
    def push(self, error, sample):
        '''Push a sample into the replay buffer with priority derived from its TD error
        '''
        p = (np.abs(error) + self.epsilon) ** self.alpha
        self.tree.add(p, sample)
    def sample(self, batch_size):
        '''Sample a batch of data; the original code is from:
        https://github.com/rlcode/per/blob/master/prioritized_memory.py
        '''
        pri_segment = self.tree.total() / batch_size
        priorities = []
        batch = []
        idxs = []
        is_weights = []
        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])
        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total()
        for i in range(batch_size):
            a = pri_segment * i
            b = pri_segment * (i+1)
            s = random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)
            prob = p / self.tree.total()
        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weights = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weights /= is_weights.max()
        return zip(*batch), idxs, is_weights
    def batch_update(self, tree_idx, abs_errors):
        '''Update the priorities of the sampled transitions with their new TD errors
        '''
        abs_errors += self.epsilon
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
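
A minimal usage sketch of the prioritized replay structures above (toy transitions and error values only; in the repository, `PER_DQN.update()` drives this loop). It assumes the `codes/` directory is on `sys.path` so that `common.memories` imports as it does in `main.py`:

```python
import numpy as np
from common.memories import ReplayTree  # same import used by the PER_DQN main script

buffer = ReplayTree(capacity=8)
# push a few toy (state, action, reward, next_state, done) transitions with initial |TD errors|
for i in range(8):
    transition = (np.zeros(4), 0, 1.0, np.zeros(4), False)
    buffer.push(error=0.5 + 0.1 * i, sample=transition)

# proportional prioritized sampling plus normalized importance-sampling weights
(states, actions, rewards, next_states, dones), idxs, is_weights = buffer.sample(batch_size=4)
print(len(idxs), is_weights)

# after recomputing TD errors for the sampled batch, write the new priorities back
buffer.batch_update(idxs, np.abs(np.random.randn(4)))
```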

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:02:24
LastEditor: John
LastEditTime: 2022-10-26 07:38:17
LastEditTime: 2022-11-14 10:27:43
Discription:
Environment:
'''
@@ -179,6 +179,8 @@ def all_seed(env,seed = 1):
    import torch
    import numpy as np
    import random
    if seed == 0:
        return
    # print(f"seed = {seed}")
    env.seed(seed) # env config
    np.random.seed(seed)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,32 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Q learning with different exploration strategies\n",
"\n",
"Authors: [johnjim0816](https://github.com/johnjim0816)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('easyrl')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.7.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8994a120d39b6e6a2ecc94b4007f5314b68aa69fc88a7f00edf21be39b41f49c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,11 @@
pyyaml==6.0
ipykernel==6.15.1
jupyter==1.0.0
matplotlib==3.5.2
seaborn==0.11.2
matplotlib==3.5.3
seaborn==0.12.1
dill==0.3.5.1
argparse==1.4.0
pandas==1.3.5
pyglet==1.5.26
importlib-metadata<5.0
importlib-metadata<5.0
setuptools==65.2.0