Update PPO, add PER DQN
Binary file not shown.
@@ -1 +0,0 @@
{"algo_name": "First-Visit MC", "env_name": "Racetrack", "train_eps": 200, "test_eps": 20, "gamma": 0.9, "epsilon": 0.15, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/MonteCarlo/outputs/Racetrack/20220815-180742/models/", "save_fig": true}
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 48 KiB |
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 40 KiB |
@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 14:26:44
LastEditor: John
-LastEditTime: 2022-11-06 00:44:56
+LastEditTime: 2022-11-08 23:35:18
Discription:
Environment:
'''
@@ -24,9 +24,6 @@ from common.launcher import Launcher
from MonteCarlo.agent import FisrtVisitMC
from MonteCarlo.config.config import GeneralConfigMC,AlgoConfigMC


curr_time = datetime.datetime.now().strftime(
    "%Y%m%d-%H%M%S")  # obtain current time
class Main(Launcher):
    def __init__(self) -> None:
        super().__init__()
@@ -0,0 +1,25 @@
general_cfg:
  algo_name: PER_DQN
  device: cpu
  env_name: CartPole-v1
  eval_eps: 10
  eval_per_episode: 5
  load_checkpoint: true
  load_path: Train_CartPole-v1_PER_DQN_20221113-162804
  max_steps: 200
  mode: test
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  hidden_dim: 256
  lr: 0.0001
  target_update: 4
@@ -0,0 +1,14 @@
2022-11-14 10:46:49 - r - INFO: - n_states: 4, n_actions: 2
2022-11-14 10:46:49 - r - INFO: - Start testing!
2022-11-14 10:46:49 - r - INFO: - Env: CartPole-v1, Algorithm: PER_DQN, Device: cpu
2022-11-14 10:46:49 - r - INFO: - Episode: 1/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 2/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 3/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 4/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 5/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 6/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 7/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 8/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 9/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Episode: 10/10, Reward: 200.000, Step: 200
2022-11-14 10:46:49 - r - INFO: - Finish testing!
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
@@ -0,0 +1,11 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,200.0,200
3,200.0,200
4,200.0,200
5,200.0,200
6,200.0,200
7,200.0,200
8,200.0,200
9,200.0,200
@@ -0,0 +1,25 @@
general_cfg:
  algo_name: PER_DQN
  device: cuda
  env_name: CartPole-v1
  eval_eps: 10
  eval_per_episode: 5
  load_checkpoint: false
  load_path: tasks
  max_steps: 200
  mode: train
  save_fig: true
  seed: 1
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  hidden_dim: 256
  lr: 0.0001
  target_update: 4
@@ -0,0 +1,224 @@
2022-11-13 16:28:04 - r - INFO: - n_states: 4, n_actions: 2
2022-11-13 16:28:19 - r - INFO: - Start training!
2022-11-13 16:28:19 - r - INFO: - Env: CartPole-v1, Algorithm: PER_DQN, Device: cuda
2022-11-13 16:28:23 - r - INFO: - Episode: 1/200, Reward: 18.000, Step: 18
2022-11-13 16:28:24 - r - INFO: - Episode: 2/200, Reward: 35.000, Step: 35
2022-11-13 16:28:24 - r - INFO: - Episode: 3/200, Reward: 13.000, Step: 13
2022-11-13 16:28:24 - r - INFO: - Episode: 4/200, Reward: 20.000, Step: 20
2022-11-13 16:28:24 - r - INFO: - Episode: 5/200, Reward: 24.000, Step: 24
2022-11-13 16:28:24 - r - INFO: - Current episode 5 has the best eval reward: 9.100
2022-11-13 16:28:24 - r - INFO: - Episode: 6/200, Reward: 10.000, Step: 10
2022-11-13 16:28:24 - r - INFO: - Episode: 7/200, Reward: 20.000, Step: 20
2022-11-13 16:28:24 - r - INFO: - Episode: 8/200, Reward: 19.000, Step: 19
2022-11-13 16:28:25 - r - INFO: - Episode: 9/200, Reward: 30.000, Step: 30
2022-11-13 16:28:25 - r - INFO: - Episode: 10/200, Reward: 10.000, Step: 10
2022-11-13 16:28:25 - r - INFO: - Current episode 10 has the best eval reward: 9.200
2022-11-13 16:28:25 - r - INFO: - Episode: 11/200, Reward: 16.000, Step: 16
2022-11-13 16:28:25 - r - INFO: - Episode: 12/200, Reward: 16.000, Step: 16
2022-11-13 16:28:25 - r - INFO: - Episode: 13/200, Reward: 12.000, Step: 12
2022-11-13 16:28:25 - r - INFO: - Episode: 14/200, Reward: 28.000, Step: 28
2022-11-13 16:28:25 - r - INFO: - Episode: 15/200, Reward: 22.000, Step: 22
2022-11-13 16:28:25 - r - INFO: - Current episode 15 has the best eval reward: 9.300
2022-11-13 16:28:25 - r - INFO: - Episode: 16/200, Reward: 14.000, Step: 14
2022-11-13 16:28:25 - r - INFO: - Episode: 17/200, Reward: 9.000, Step: 9
2022-11-13 16:28:26 - r - INFO: - Episode: 18/200, Reward: 13.000, Step: 13
2022-11-13 16:28:26 - r - INFO: - Episode: 19/200, Reward: 19.000, Step: 19
2022-11-13 16:28:26 - r - INFO: - Episode: 20/200, Reward: 10.000, Step: 10
2022-11-13 16:28:26 - r - INFO: - Episode: 21/200, Reward: 10.000, Step: 10
2022-11-13 16:28:26 - r - INFO: - Episode: 22/200, Reward: 12.000, Step: 12
2022-11-13 16:28:26 - r - INFO: - Episode: 23/200, Reward: 9.000, Step: 9
2022-11-13 16:28:26 - r - INFO: - Episode: 24/200, Reward: 12.000, Step: 12
2022-11-13 16:28:26 - r - INFO: - Episode: 25/200, Reward: 11.000, Step: 11
2022-11-13 16:28:26 - r - INFO: - Current episode 25 has the best eval reward: 9.800
2022-11-13 16:28:26 - r - INFO: - Episode: 26/200, Reward: 11.000, Step: 11
2022-11-13 16:28:26 - r - INFO: - Episode: 27/200, Reward: 13.000, Step: 13
2022-11-13 16:28:26 - r - INFO: - Episode: 28/200, Reward: 11.000, Step: 11
2022-11-13 16:28:27 - r - INFO: - Episode: 29/200, Reward: 13.000, Step: 13
2022-11-13 16:28:27 - r - INFO: - Episode: 30/200, Reward: 20.000, Step: 20
2022-11-13 16:28:27 - r - INFO: - Current episode 30 has the best eval reward: 12.200
2022-11-13 16:28:27 - r - INFO: - Episode: 31/200, Reward: 16.000, Step: 16
2022-11-13 16:28:27 - r - INFO: - Episode: 32/200, Reward: 9.000, Step: 9
2022-11-13 16:28:27 - r - INFO: - Episode: 33/200, Reward: 16.000, Step: 16
2022-11-13 16:28:27 - r - INFO: - Episode: 34/200, Reward: 15.000, Step: 15
2022-11-13 16:28:27 - r - INFO: - Episode: 35/200, Reward: 12.000, Step: 12
2022-11-13 16:28:27 - r - INFO: - Current episode 35 has the best eval reward: 12.500
2022-11-13 16:28:27 - r - INFO: - Episode: 36/200, Reward: 12.000, Step: 12
2022-11-13 16:28:27 - r - INFO: - Episode: 37/200, Reward: 16.000, Step: 16
2022-11-13 16:28:28 - r - INFO: - Episode: 38/200, Reward: 13.000, Step: 13
2022-11-13 16:28:28 - r - INFO: - Episode: 39/200, Reward: 18.000, Step: 18
2022-11-13 16:28:28 - r - INFO: - Episode: 40/200, Reward: 18.000, Step: 18
2022-11-13 16:28:28 - r - INFO: - Current episode 40 has the best eval reward: 20.400
2022-11-13 16:28:28 - r - INFO: - Episode: 41/200, Reward: 48.000, Step: 48
2022-11-13 16:28:29 - r - INFO: - Episode: 42/200, Reward: 52.000, Step: 52
2022-11-13 16:28:29 - r - INFO: - Episode: 43/200, Reward: 33.000, Step: 33
2022-11-13 16:28:29 - r - INFO: - Episode: 44/200, Reward: 15.000, Step: 15
2022-11-13 16:28:29 - r - INFO: - Episode: 45/200, Reward: 18.000, Step: 18
2022-11-13 16:28:29 - r - INFO: - Episode: 46/200, Reward: 22.000, Step: 22
2022-11-13 16:28:29 - r - INFO: - Episode: 47/200, Reward: 19.000, Step: 19
2022-11-13 16:28:30 - r - INFO: - Episode: 48/200, Reward: 19.000, Step: 19
2022-11-13 16:28:30 - r - INFO: - Episode: 49/200, Reward: 11.000, Step: 11
2022-11-13 16:28:30 - r - INFO: - Episode: 50/200, Reward: 9.000, Step: 9
2022-11-13 16:28:30 - r - INFO: - Episode: 51/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 52/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 53/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 54/200, Reward: 10.000, Step: 10
2022-11-13 16:28:30 - r - INFO: - Episode: 55/200, Reward: 9.000, Step: 9
2022-11-13 16:28:30 - r - INFO: - Episode: 56/200, Reward: 17.000, Step: 17
2022-11-13 16:28:31 - r - INFO: - Episode: 57/200, Reward: 75.000, Step: 75
2022-11-13 16:28:31 - r - INFO: - Episode: 58/200, Reward: 28.000, Step: 28
2022-11-13 16:28:31 - r - INFO: - Episode: 59/200, Reward: 30.000, Step: 30
2022-11-13 16:28:32 - r - INFO: - Episode: 60/200, Reward: 54.000, Step: 54
2022-11-13 16:28:32 - r - INFO: - Current episode 60 has the best eval reward: 34.600
2022-11-13 16:28:32 - r - INFO: - Episode: 61/200, Reward: 22.000, Step: 22
2022-11-13 16:28:32 - r - INFO: - Episode: 62/200, Reward: 28.000, Step: 28
2022-11-13 16:28:32 - r - INFO: - Episode: 63/200, Reward: 26.000, Step: 26
2022-11-13 16:28:33 - r - INFO: - Episode: 64/200, Reward: 32.000, Step: 32
2022-11-13 16:28:33 - r - INFO: - Episode: 65/200, Reward: 30.000, Step: 30
2022-11-13 16:28:33 - r - INFO: - Episode: 66/200, Reward: 29.000, Step: 29
2022-11-13 16:28:34 - r - INFO: - Episode: 67/200, Reward: 28.000, Step: 28
2022-11-13 16:28:34 - r - INFO: - Episode: 68/200, Reward: 38.000, Step: 38
2022-11-13 16:28:34 - r - INFO: - Episode: 69/200, Reward: 28.000, Step: 28
2022-11-13 16:28:34 - r - INFO: - Episode: 70/200, Reward: 22.000, Step: 22
2022-11-13 16:28:34 - r - INFO: - Current episode 70 has the best eval reward: 36.700
2022-11-13 16:28:35 - r - INFO: - Episode: 71/200, Reward: 40.000, Step: 40
2022-11-13 16:28:35 - r - INFO: - Episode: 72/200, Reward: 27.000, Step: 27
2022-11-13 16:28:35 - r - INFO: - Episode: 73/200, Reward: 24.000, Step: 24
2022-11-13 16:28:35 - r - INFO: - Episode: 74/200, Reward: 47.000, Step: 47
2022-11-13 16:28:36 - r - INFO: - Episode: 75/200, Reward: 127.000, Step: 127
2022-11-13 16:28:37 - r - INFO: - Episode: 76/200, Reward: 48.000, Step: 48
2022-11-13 16:28:37 - r - INFO: - Episode: 77/200, Reward: 27.000, Step: 27
2022-11-13 16:28:37 - r - INFO: - Episode: 78/200, Reward: 65.000, Step: 65
2022-11-13 16:28:38 - r - INFO: - Episode: 79/200, Reward: 75.000, Step: 75
2022-11-13 16:28:38 - r - INFO: - Episode: 80/200, Reward: 47.000, Step: 47
2022-11-13 16:28:38 - r - INFO: - Current episode 80 has the best eval reward: 37.200
2022-11-13 16:28:39 - r - INFO: - Episode: 81/200, Reward: 34.000, Step: 34
2022-11-13 16:28:39 - r - INFO: - Episode: 82/200, Reward: 38.000, Step: 38
2022-11-13 16:28:39 - r - INFO: - Episode: 83/200, Reward: 24.000, Step: 24
2022-11-13 16:28:39 - r - INFO: - Episode: 84/200, Reward: 47.000, Step: 47
2022-11-13 16:28:40 - r - INFO: - Episode: 85/200, Reward: 35.000, Step: 35
2022-11-13 16:28:40 - r - INFO: - Current episode 85 has the best eval reward: 66.900
2022-11-13 16:28:41 - r - INFO: - Episode: 86/200, Reward: 103.000, Step: 103
2022-11-13 16:28:41 - r - INFO: - Episode: 87/200, Reward: 64.000, Step: 64
2022-11-13 16:28:42 - r - INFO: - Episode: 88/200, Reward: 59.000, Step: 59
2022-11-13 16:28:43 - r - INFO: - Episode: 89/200, Reward: 200.000, Step: 200
2022-11-13 16:28:44 - r - INFO: - Episode: 90/200, Reward: 200.000, Step: 200
2022-11-13 16:28:46 - r - INFO: - Current episode 90 has the best eval reward: 200.000
2022-11-13 16:28:47 - r - INFO: - Episode: 91/200, Reward: 200.000, Step: 200
2022-11-13 16:28:48 - r - INFO: - Episode: 92/200, Reward: 200.000, Step: 200
2022-11-13 16:28:50 - r - INFO: - Episode: 93/200, Reward: 200.000, Step: 200
2022-11-13 16:28:51 - r - INFO: - Episode: 94/200, Reward: 200.000, Step: 200
2022-11-13 16:28:52 - r - INFO: - Episode: 95/200, Reward: 200.000, Step: 200
2022-11-13 16:28:54 - r - INFO: - Current episode 95 has the best eval reward: 200.000
2022-11-13 16:28:55 - r - INFO: - Episode: 96/200, Reward: 200.000, Step: 200
2022-11-13 16:28:56 - r - INFO: - Episode: 97/200, Reward: 200.000, Step: 200
2022-11-13 16:28:58 - r - INFO: - Episode: 98/200, Reward: 200.000, Step: 200
2022-11-13 16:28:59 - r - INFO: - Episode: 99/200, Reward: 200.000, Step: 200
2022-11-13 16:29:00 - r - INFO: - Episode: 100/200, Reward: 200.000, Step: 200
2022-11-13 16:29:02 - r - INFO: - Current episode 100 has the best eval reward: 200.000
2022-11-13 16:29:04 - r - INFO: - Episode: 101/200, Reward: 200.000, Step: 200
2022-11-13 16:29:05 - r - INFO: - Episode: 102/200, Reward: 200.000, Step: 200
2022-11-13 16:29:06 - r - INFO: - Episode: 103/200, Reward: 200.000, Step: 200
2022-11-13 16:29:07 - r - INFO: - Episode: 104/200, Reward: 200.000, Step: 200
2022-11-13 16:29:09 - r - INFO: - Episode: 105/200, Reward: 200.000, Step: 200
2022-11-13 16:29:10 - r - INFO: - Current episode 105 has the best eval reward: 200.000
2022-11-13 16:29:11 - r - INFO: - Episode: 106/200, Reward: 200.000, Step: 200
2022-11-13 16:29:13 - r - INFO: - Episode: 107/200, Reward: 200.000, Step: 200
2022-11-13 16:29:14 - r - INFO: - Episode: 108/200, Reward: 200.000, Step: 200
2022-11-13 16:29:16 - r - INFO: - Episode: 109/200, Reward: 200.000, Step: 200
2022-11-13 16:29:17 - r - INFO: - Episode: 110/200, Reward: 200.000, Step: 200
2022-11-13 16:29:20 - r - INFO: - Episode: 111/200, Reward: 200.000, Step: 200
2022-11-13 16:29:21 - r - INFO: - Episode: 112/200, Reward: 200.000, Step: 200
2022-11-13 16:29:22 - r - INFO: - Episode: 113/200, Reward: 200.000, Step: 200
2022-11-13 16:29:23 - r - INFO: - Episode: 114/200, Reward: 200.000, Step: 200
2022-11-13 16:29:25 - r - INFO: - Episode: 115/200, Reward: 200.000, Step: 200
2022-11-13 16:29:26 - r - INFO: - Current episode 115 has the best eval reward: 200.000
2022-11-13 16:29:27 - r - INFO: - Episode: 116/200, Reward: 200.000, Step: 200
2022-11-13 16:29:29 - r - INFO: - Episode: 117/200, Reward: 200.000, Step: 200
2022-11-13 16:29:30 - r - INFO: - Episode: 118/200, Reward: 200.000, Step: 200
2022-11-13 16:29:31 - r - INFO: - Episode: 119/200, Reward: 200.000, Step: 200
2022-11-13 16:29:33 - r - INFO: - Episode: 120/200, Reward: 200.000, Step: 200
2022-11-13 16:29:34 - r - INFO: - Current episode 120 has the best eval reward: 200.000
2022-11-13 16:29:35 - r - INFO: - Episode: 121/200, Reward: 200.000, Step: 200
2022-11-13 16:29:37 - r - INFO: - Episode: 122/200, Reward: 200.000, Step: 200
2022-11-13 16:29:38 - r - INFO: - Episode: 123/200, Reward: 200.000, Step: 200
2022-11-13 16:29:39 - r - INFO: - Episode: 124/200, Reward: 200.000, Step: 200
2022-11-13 16:29:41 - r - INFO: - Episode: 125/200, Reward: 200.000, Step: 200
2022-11-13 16:29:43 - r - INFO: - Episode: 126/200, Reward: 200.000, Step: 200
2022-11-13 16:29:45 - r - INFO: - Episode: 127/200, Reward: 200.000, Step: 200
2022-11-13 16:29:46 - r - INFO: - Episode: 128/200, Reward: 200.000, Step: 200
2022-11-13 16:29:47 - r - INFO: - Episode: 129/200, Reward: 200.000, Step: 200
2022-11-13 16:29:49 - r - INFO: - Episode: 130/200, Reward: 200.000, Step: 200
2022-11-13 16:29:51 - r - INFO: - Episode: 131/200, Reward: 200.000, Step: 200
2022-11-13 16:29:53 - r - INFO: - Episode: 132/200, Reward: 200.000, Step: 200
2022-11-13 16:29:54 - r - INFO: - Episode: 133/200, Reward: 200.000, Step: 200
2022-11-13 16:29:55 - r - INFO: - Episode: 134/200, Reward: 200.000, Step: 200
2022-11-13 16:29:57 - r - INFO: - Episode: 135/200, Reward: 200.000, Step: 200
2022-11-13 16:29:59 - r - INFO: - Episode: 136/200, Reward: 200.000, Step: 200
2022-11-13 16:30:01 - r - INFO: - Episode: 137/200, Reward: 185.000, Step: 185
2022-11-13 16:30:02 - r - INFO: - Episode: 138/200, Reward: 193.000, Step: 193
2022-11-13 16:30:03 - r - INFO: - Episode: 139/200, Reward: 192.000, Step: 192
2022-11-13 16:30:04 - r - INFO: - Episode: 140/200, Reward: 200.000, Step: 200
2022-11-13 16:30:07 - r - INFO: - Episode: 141/200, Reward: 200.000, Step: 200
2022-11-13 16:30:08 - r - INFO: - Episode: 142/200, Reward: 200.000, Step: 200
2022-11-13 16:30:10 - r - INFO: - Episode: 143/200, Reward: 200.000, Step: 200
2022-11-13 16:30:11 - r - INFO: - Episode: 144/200, Reward: 191.000, Step: 191
2022-11-13 16:30:12 - r - INFO: - Episode: 145/200, Reward: 200.000, Step: 200
2022-11-13 16:30:15 - r - INFO: - Episode: 146/200, Reward: 184.000, Step: 184
2022-11-13 16:30:17 - r - INFO: - Episode: 147/200, Reward: 198.000, Step: 198
2022-11-13 16:30:18 - r - INFO: - Episode: 148/200, Reward: 200.000, Step: 200
2022-11-13 16:30:19 - r - INFO: - Episode: 149/200, Reward: 200.000, Step: 200
2022-11-13 16:30:21 - r - INFO: - Episode: 150/200, Reward: 192.000, Step: 192
2022-11-13 16:30:23 - r - INFO: - Episode: 151/200, Reward: 186.000, Step: 186
2022-11-13 16:30:25 - r - INFO: - Episode: 152/200, Reward: 200.000, Step: 200
2022-11-13 16:30:26 - r - INFO: - Episode: 153/200, Reward: 194.000, Step: 194
2022-11-13 16:30:27 - r - INFO: - Episode: 154/200, Reward: 199.000, Step: 199
2022-11-13 16:30:29 - r - INFO: - Episode: 155/200, Reward: 183.000, Step: 183
2022-11-13 16:30:32 - r - INFO: - Episode: 156/200, Reward: 173.000, Step: 173
2022-11-13 16:30:33 - r - INFO: - Episode: 157/200, Reward: 197.000, Step: 197
2022-11-13 16:30:34 - r - INFO: - Episode: 158/200, Reward: 200.000, Step: 200
2022-11-13 16:30:36 - r - INFO: - Episode: 159/200, Reward: 200.000, Step: 200
2022-11-13 16:30:37 - r - INFO: - Episode: 160/200, Reward: 196.000, Step: 196
2022-11-13 16:30:40 - r - INFO: - Episode: 161/200, Reward: 200.000, Step: 200
2022-11-13 16:30:42 - r - INFO: - Episode: 162/200, Reward: 200.000, Step: 200
2022-11-13 16:30:43 - r - INFO: - Episode: 163/200, Reward: 194.000, Step: 194
2022-11-13 16:30:44 - r - INFO: - Episode: 164/200, Reward: 185.000, Step: 185
2022-11-13 16:30:45 - r - INFO: - Episode: 165/200, Reward: 173.000, Step: 173
2022-11-13 16:30:48 - r - INFO: - Episode: 166/200, Reward: 192.000, Step: 192
2022-11-13 16:30:49 - r - INFO: - Episode: 167/200, Reward: 164.000, Step: 164
2022-11-13 16:30:50 - r - INFO: - Episode: 168/200, Reward: 188.000, Step: 188
2022-11-13 16:30:52 - r - INFO: - Episode: 169/200, Reward: 189.000, Step: 189
2022-11-13 16:30:53 - r - INFO: - Episode: 170/200, Reward: 197.000, Step: 197
2022-11-13 16:30:55 - r - INFO: - Episode: 171/200, Reward: 187.000, Step: 187
2022-11-13 16:30:57 - r - INFO: - Episode: 172/200, Reward: 200.000, Step: 200
2022-11-13 16:30:58 - r - INFO: - Episode: 173/200, Reward: 195.000, Step: 195
2022-11-13 16:30:59 - r - INFO: - Episode: 174/200, Reward: 200.000, Step: 200
2022-11-13 16:31:01 - r - INFO: - Episode: 175/200, Reward: 195.000, Step: 195
2022-11-13 16:31:03 - r - INFO: - Episode: 176/200, Reward: 200.000, Step: 200
2022-11-13 16:31:05 - r - INFO: - Episode: 177/200, Reward: 200.000, Step: 200
2022-11-13 16:31:06 - r - INFO: - Episode: 178/200, Reward: 200.000, Step: 200
2022-11-13 16:31:07 - r - INFO: - Episode: 179/200, Reward: 200.000, Step: 200
2022-11-13 16:31:09 - r - INFO: - Episode: 180/200, Reward: 200.000, Step: 200
2022-11-13 16:31:11 - r - INFO: - Episode: 181/200, Reward: 200.000, Step: 200
2022-11-13 16:31:13 - r - INFO: - Episode: 182/200, Reward: 200.000, Step: 200
2022-11-13 16:31:14 - r - INFO: - Episode: 183/200, Reward: 200.000, Step: 200
2022-11-13 16:31:15 - r - INFO: - Episode: 184/200, Reward: 200.000, Step: 200
2022-11-13 16:31:17 - r - INFO: - Episode: 185/200, Reward: 173.000, Step: 173
2022-11-13 16:31:19 - r - INFO: - Episode: 186/200, Reward: 200.000, Step: 200
2022-11-13 16:31:21 - r - INFO: - Episode: 187/200, Reward: 200.000, Step: 200
2022-11-13 16:31:22 - r - INFO: - Episode: 188/200, Reward: 200.000, Step: 200
2022-11-13 16:31:23 - r - INFO: - Episode: 189/200, Reward: 200.000, Step: 200
2022-11-13 16:31:24 - r - INFO: - Episode: 190/200, Reward: 200.000, Step: 200
2022-11-13 16:31:26 - r - INFO: - Current episode 190 has the best eval reward: 200.000
2022-11-13 16:31:27 - r - INFO: - Episode: 191/200, Reward: 200.000, Step: 200
2022-11-13 16:31:29 - r - INFO: - Episode: 192/200, Reward: 200.000, Step: 200
2022-11-13 16:31:30 - r - INFO: - Episode: 193/200, Reward: 200.000, Step: 200
2022-11-13 16:31:31 - r - INFO: - Episode: 194/200, Reward: 200.000, Step: 200
2022-11-13 16:31:33 - r - INFO: - Episode: 195/200, Reward: 200.000, Step: 200
2022-11-13 16:31:34 - r - INFO: - Current episode 195 has the best eval reward: 200.000
2022-11-13 16:31:35 - r - INFO: - Episode: 196/200, Reward: 200.000, Step: 200
2022-11-13 16:31:37 - r - INFO: - Episode: 197/200, Reward: 200.000, Step: 200
2022-11-13 16:31:38 - r - INFO: - Episode: 198/200, Reward: 200.000, Step: 200
2022-11-13 16:31:39 - r - INFO: - Episode: 199/200, Reward: 200.000, Step: 200
2022-11-13 16:31:40 - r - INFO: - Episode: 200/200, Reward: 200.000, Step: 200
2022-11-13 16:31:42 - r - INFO: - Current episode 200 has the best eval reward: 200.000
2022-11-13 16:31:42 - r - INFO: - Finish training!
Binary file not shown.
Binary file not shown.
After Width: | Height: | Size: 48 KiB |
@@ -0,0 +1,201 @@
episodes,rewards,steps
0,18.0,18
1,35.0,35
2,13.0,13
3,20.0,20
4,24.0,24
5,10.0,10
6,20.0,20
7,19.0,19
8,30.0,30
9,10.0,10
10,16.0,16
11,16.0,16
12,12.0,12
13,28.0,28
14,22.0,22
15,14.0,14
16,9.0,9
17,13.0,13
18,19.0,19
19,10.0,10
20,10.0,10
21,12.0,12
22,9.0,9
23,12.0,12
24,11.0,11
25,11.0,11
26,13.0,13
27,11.0,11
28,13.0,13
29,20.0,20
30,16.0,16
31,9.0,9
32,16.0,16
33,15.0,15
34,12.0,12
35,12.0,12
36,16.0,16
37,13.0,13
38,18.0,18
39,18.0,18
40,48.0,48
41,52.0,52
42,33.0,33
43,15.0,15
44,18.0,18
45,22.0,22
46,19.0,19
47,19.0,19
48,11.0,11
49,9.0,9
50,10.0,10
51,10.0,10
52,10.0,10
53,10.0,10
54,9.0,9
55,17.0,17
56,75.0,75
57,28.0,28
58,30.0,30
59,54.0,54
60,22.0,22
61,28.0,28
62,26.0,26
63,32.0,32
64,30.0,30
65,29.0,29
66,28.0,28
67,38.0,38
68,28.0,28
69,22.0,22
70,40.0,40
71,27.0,27
72,24.0,24
73,47.0,47
74,127.0,127
75,48.0,48
76,27.0,27
77,65.0,65
78,75.0,75
79,47.0,47
80,34.0,34
81,38.0,38
82,24.0,24
83,47.0,47
84,35.0,35
85,103.0,103
86,64.0,64
87,59.0,59
88,200.0,200
89,200.0,200
90,200.0,200
91,200.0,200
92,200.0,200
93,200.0,200
94,200.0,200
95,200.0,200
96,200.0,200
97,200.0,200
98,200.0,200
99,200.0,200
100,200.0,200
101,200.0,200
102,200.0,200
103,200.0,200
104,200.0,200
105,200.0,200
106,200.0,200
107,200.0,200
108,200.0,200
109,200.0,200
110,200.0,200
111,200.0,200
112,200.0,200
113,200.0,200
114,200.0,200
115,200.0,200
116,200.0,200
117,200.0,200
118,200.0,200
119,200.0,200
120,200.0,200
121,200.0,200
122,200.0,200
123,200.0,200
124,200.0,200
125,200.0,200
126,200.0,200
127,200.0,200
128,200.0,200
129,200.0,200
130,200.0,200
131,200.0,200
132,200.0,200
133,200.0,200
134,200.0,200
135,200.0,200
136,185.0,185
137,193.0,193
138,192.0,192
139,200.0,200
140,200.0,200
141,200.0,200
142,200.0,200
143,191.0,191
144,200.0,200
145,184.0,184
146,198.0,198
147,200.0,200
148,200.0,200
149,192.0,192
150,186.0,186
151,200.0,200
152,194.0,194
153,199.0,199
154,183.0,183
155,173.0,173
156,197.0,197
157,200.0,200
158,200.0,200
159,196.0,196
160,200.0,200
161,200.0,200
162,194.0,194
163,185.0,185
164,173.0,173
165,192.0,192
166,164.0,164
167,188.0,188
168,189.0,189
169,197.0,197
170,187.0,187
171,200.0,200
172,195.0,195
173,200.0,200
174,195.0,195
175,200.0,200
176,200.0,200
177,200.0,200
178,200.0,200
179,200.0,200
180,200.0,200
181,200.0,200
182,200.0,200
183,200.0,200
184,173.0,173
185,200.0,200
186,200.0,200
187,200.0,200
188,200.0,200
189,200.0,200
190,200.0,200
191,200.0,200
192,200.0,200
193,200.0,200
194,200.0,200
195,200.0,200
196,200.0,200
197,200.0,200
198,200.0,200
199,200.0,200
22  projects/codes/PER_DQN/config/CartPole-v1_PER_DQN_Test.yaml  Normal file
@@ -0,0 +1,22 @@
general_cfg:
  algo_name: PER_DQN
  device: cpu
  env_name: CartPole-v1
  mode: test
  load_checkpoint: true
  load_path: Train_CartPole-v1_PER_DQN_20221113-162804
  max_steps: 200
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  lr: 0.0001
  target_update: 4
22  projects/codes/PER_DQN/config/CartPole-v1_PER_DQN_Train.yaml  Normal file
@@ -0,0 +1,22 @@
general_cfg:
  algo_name: PER_DQN
  device: cuda
  env_name: CartPole-v1
  mode: train
  load_checkpoint: false
  load_path: Train_CartPole-v1_PER_DQN_20221026-054757
  max_steps: 200
  save_fig: true
  seed: 0
  show_fig: false
  test_eps: 10
  train_eps: 200
algo_cfg:
  batch_size: 64
  buffer_size: 100000
  epsilon_decay: 500
  epsilon_end: 0.01
  epsilon_start: 0.95
  gamma: 0.95
  lr: 0.0001
  target_update: 4
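The repo's Launcher merges a YAML file like the one above over the defaults defined in config/config.py below (via merge_class_attrs in common.utils). As a rough illustration only, assuming PyYAML is installed, the override step amounts to:

import yaml
from config.config import GeneralConfigDQN

with open("config/CartPole-v1_PER_DQN_Train.yaml") as f:
    cfg_dict = yaml.safe_load(f)

general_cfg = GeneralConfigDQN()              # defaults from config/config.py
for k, v in cfg_dict["general_cfg"].items():
    setattr(general_cfg, k, v)                # YAML values override the defaults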
38  projects/codes/PER_DQN/config/config.py  Normal file
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 00:37:33
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:19:08
Description: default parameters of DQN
'''
from common.config import GeneralConfig,AlgoConfig
class GeneralConfigDQN(GeneralConfig):
    def __init__(self) -> None:
        self.env_name = "CartPole-v1"  # name of environment
        self.algo_name = "PER_DQN"  # name of algorithm
        self.mode = "train"  # train or test
        self.seed = 1  # random seed
        self.device = "cuda"  # device to use
        self.train_eps = 200  # number of episodes for training
        self.test_eps = 10  # number of episodes for testing
        self.max_steps = 200  # max steps for each episode
        self.load_checkpoint = False
        self.load_path = "tasks"  # path to load model
        self.show_fig = False  # show figure or not
        self.save_fig = True  # save figure or not

class AlgoConfigDQN(AlgoConfig):
    def __init__(self) -> None:
        # setting epsilon_start = epsilon_end gives a fixed epsilon = epsilon_end
        self.epsilon_start = 0.95  # epsilon start value
        self.epsilon_end = 0.01  # epsilon end value
        self.epsilon_decay = 500  # epsilon decay rate
        self.hidden_dim = 256  # hidden_dim for MLP
        self.gamma = 0.95  # discount factor
        self.lr = 0.0001  # learning rate
        self.buffer_size = 100000  # size of replay buffer
        self.batch_size = 64  # batch size
        self.target_update = 4  # target network update frequency
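For these defaults, the exponential schedule implemented in per_dqn.py below decays epsilon from 0.95 toward 0.01 with a time constant of epsilon_decay = 500 samples. A small sketch of the values it produces:

import math

def epsilon(t, start=0.95, end=0.01, decay=500):
    # same formula as PER_DQN.sample_action below
    return end + (start - end) * math.exp(-t / decay)

for t in (0, 500, 1000, 2500):
    print(t, round(epsilon(t), 3))  # 0.95, 0.356, 0.137, 0.016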
139  projects/codes/PER_DQN/per_dqn.py  Normal file
@@ -0,0 +1,139 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: DingLi
Email: wangzhongren@sjtu.edu.cn
Date: 2022-10-31 22:54:00
LastEditor: DingLi
LastEditTime: 2022-11-14 10:43:18
Description: CartPole-v1
'''

'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2022-10-26 07:50:24
@Description:
@Environment: python 3.7.7
'''
'''off-policy
'''

import copy
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import numpy as np
class PER_DQN:
    def __init__(self, model, memory, cfg):
        self.n_actions = cfg.n_actions
        self.device = torch.device(cfg.device)
        self.gamma = cfg.gamma
        ## e-greedy parameters
        self.sample_count = 0  # sample count for epsilon decay
        self.epsilon = cfg.epsilon_start
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.batch_size = cfg.batch_size
        self.target_update = cfg.target_update  # period "C" for syncing the target network
        self.policy_net = model.to(self.device)
        self.target_net = copy.deepcopy(model).to(self.device)  # a separate copy, so the target network can lag behind the policy network
        ## copy parameters from policy net to target net
        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(param.data)
        # self.target_net.load_state_dict(self.policy_net.state_dict())  # or use this to copy parameters
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
        self.memory = memory
        self.update_flag = False
    def sample_action(self, state):
        ''' sample action with e-greedy policy
        '''
        self.sample_count += 1
        # epsilon must decay (linearly, exponentially, etc.) to balance exploration and exploitation
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)
        if random.random() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item()  # choose the action with the maximum q value
        else:
            action = random.randrange(self.n_actions)
        return action
    def predict_action(self, state):
        ''' predict action
        '''
        with torch.no_grad():
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            q_values = self.policy_net(state)
            action = q_values.max(1)[1].item()  # choose the action with the maximum q value
        return action
    def update(self):
        if len(self.memory) < self.batch_size:  # do not update until the memory holds at least one batch of transitions
            return
        else:
            if not self.update_flag:
                print("Begin to update!")
                self.update_flag = True
            # sample a batch of transitions from the replay buffer
            (state_batch, action_batch, reward_batch, next_state_batch, done_batch), idxs_batch, is_weights_batch = self.memory.sample(
                self.batch_size)
            state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)  # shape(batchsize,n_states)
            action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  # shape(batchsize,1)
            reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float).unsqueeze(1)  # shape(batchsize,1)
            next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)  # shape(batchsize,n_states)
            done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)  # shape(batchsize,1)
            q_value_batch = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # shape(batchsize,1), requires_grad=True
            next_max_q_value_batch = self.target_net(next_state_batch).max(1)[0].detach().unsqueeze(1)
            expected_q_value_batch = reward_batch + self.gamma * next_max_q_value_batch * (1 - done_batch)

            # importance-sampling-weighted MSE loss
            is_weights = torch.from_numpy(is_weights_batch).to(self.device)  # use the configured device instead of hard-coding .cuda()
            loss = torch.mean(torch.pow((q_value_batch - expected_q_value_batch) * is_weights, 2))
            # loss = nn.MSELoss()(q_value_batch, expected_q_value_batch)  # unweighted alternative

            abs_errors = np.sum(np.abs(q_value_batch.cpu().detach().numpy() - expected_q_value_batch.cpu().detach().numpy()), axis=1)
            self.memory.batch_update(idxs_batch, abs_errors)  # refresh priorities with the new TD errors

            # backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            # clip gradients to avoid explosion
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
            if self.sample_count % self.target_update == 0:  # sync the target net; target_update is the "C" in the pseudocode
                self.target_net.load_state_dict(self.policy_net.state_dict())
    def save_model(self, fpath):
        from pathlib import Path
        # create path
        Path(fpath).mkdir(parents=True, exist_ok=True)
        torch.save(self.target_net.state_dict(), f"{fpath}/checkpoint.pt")

    def load_model(self, fpath):
        checkpoint = torch.load(f"{fpath}/checkpoint.pt", map_location=self.device)
        self.target_net.load_state_dict(checkpoint)
        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            param.data.copy_(target_param.data)
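A minimal smoke test of the wiring above, mirroring what task0.py does below. Sketch only: it assumes you run from the PER_DQN directory so the repo's MLP and ReplayTree import the same way task0.py imports them.

import types
import numpy as np
from common.models import MLP
from common.memories import ReplayTree
from per_dqn import PER_DQN

cfg = types.SimpleNamespace(
    n_actions=2, device="cpu", gamma=0.95, epsilon_start=0.95,
    epsilon_end=0.01, epsilon_decay=500, batch_size=64, lr=0.0001, target_update=4)
agent = PER_DQN(MLP(4, 2, hidden_dim=256), ReplayTree(100000), cfg)
action = agent.sample_action(np.zeros(4, dtype=np.float32))  # epsilon-greedy action in {0, 1}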
104  projects/codes/PER_DQN/task0.py  Normal file
@@ -0,0 +1,104 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: DingLi
Email: wangzhongren@sjtu.edu.cn
Date: 2022-10-31 22:54:00
LastEditor: DingLi
LastEditTime: 2022-11-14 10:45:11
Description: CartPole-v1
'''

'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-12 11:09:54
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:29:25
Description: CartPole-v1,Acrobot-v1
'''
import sys, os
curr_path = os.path.dirname(os.path.abspath(__file__))  # current path
parent_path = os.path.dirname(curr_path)  # parent path
sys.path.append(parent_path)  # add to system path
import gym
import torch

from common.utils import all_seed, merge_class_attrs
from common.models import MLP
from common.memories import ReplayBuffer, ReplayTree
from common.launcher import Launcher
from envs.register import register_env
from per_dqn import PER_DQN
from config.config import GeneralConfigDQN, AlgoConfigDQN
class Main(Launcher):
    def __init__(self) -> None:
        super().__init__()
        self.cfgs['general_cfg'] = merge_class_attrs(self.cfgs['general_cfg'], GeneralConfigDQN())
        self.cfgs['algo_cfg'] = merge_class_attrs(self.cfgs['algo_cfg'], AlgoConfigDQN())
    def env_agent_config(self, cfg, logger):
        ''' create env and agent
        '''
        register_env(cfg.env_name)
        env = gym.make(cfg.env_name, new_step_api=True)  # create env
        all_seed(env, seed=cfg.seed)  # set random seed
        try:  # state dimension
            n_states = env.observation_space.n  # discrete observation space
        except AttributeError:
            n_states = env.observation_space.shape[0]  # continuous (Box) observation space
        n_actions = env.action_space.n  # action dimension
        logger.info(f"n_states: {n_states}, n_actions: {n_actions}")  # print info
        # update cfg parameters
        setattr(cfg, 'n_states', n_states)
        setattr(cfg, 'n_actions', n_actions)
        # cfg.update({"n_states": n_states, "n_actions": n_actions})  # alternative way to update cfg
        model = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim)
        memory = ReplayTree(cfg.buffer_size)  # prioritized replay buffer backed by a SumTree
        agent = PER_DQN(model, memory, cfg)  # create agent
        return env, agent
    def train_one_episode(self, env, agent, cfg):
        ''' train one episode
        '''
        ep_reward = 0  # reward per episode
        ep_step = 0
        state = env.reset()  # reset and obtain initial state
        for _ in range(cfg.max_steps):
            ep_step += 1
            action = agent.sample_action(state)  # sample action
            next_state, reward, terminated, truncated, info = env.step(action)  # update env and return transitions under new_step_api of OpenAI Gym

            # initial priority: the absolute TD error of this transition
            policy_val = agent.policy_net(torch.tensor(state, device=cfg.device))[action]
            target_val = agent.target_net(torch.tensor(next_state, device=cfg.device))

            if terminated:
                error = abs(policy_val - reward)
            else:
                error = abs(policy_val - reward - cfg.gamma * torch.max(target_val))
            agent.memory.push(error.cpu().detach().numpy(), (state, action, reward,
                              next_state, terminated))  # save transitions
            state = next_state  # update next state for env
            agent.update()  # update agent
            ep_reward += reward
            if terminated:
                break
        return agent, ep_reward, ep_step
    def test_one_episode(self, env, agent, cfg):
        ep_reward = 0  # reward per episode
        ep_step = 0
        state = env.reset()  # reset and obtain initial state
        for _ in range(cfg.max_steps):
            ep_step += 1
            action = agent.predict_action(state)  # predict action
            next_state, reward, terminated, _, _ = env.step(action)
            state = next_state
            ep_reward += reward
            if terminated:
                break
        return agent, ep_reward, ep_step


if __name__ == "__main__":
    main = Main()
    main.run()
@@ -36,11 +36,11 @@ class Launcher:
        ep_reward = 0
        ep_step = 0
        return agent, ep_reward, ep_step
-    def test_one_episode(self,env, agent, cfg):
+    def test_one_episode(self, env, agent, cfg):
        ep_reward = 0
        ep_step = 0
        return agent, ep_reward, ep_step
-    def evaluate(self,env, agent, cfg):
+    def evaluate(self, env, agent, cfg):
        sum_eval_reward = 0
        for _ in range(cfg.eval_eps):
            _, eval_ep_reward, _ = self.test_one_episode(env, agent, cfg)
@@ -10,6 +10,7 @@ LastEditTime: 2022-08-28 23:44:06
@Environment: python 3.7.7
'''
import random
+import numpy as np
from collections import deque
class ReplayBuffer:
    def __init__(self, capacity):
@@ -71,4 +72,136 @@ class PGReplay(ReplayBufferQue):
        ''' sample all the transitions
        '''
        batch = list(self.buffer)
        return zip(*batch)
class SumTree:
    '''SumTree for the PER (Prioritized Experience Replay) DQN.
    This SumTree code is a modified version and the original code is from:
    https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/contents/5.2_Prioritized_Replay_DQN/RL_brain.py
    '''
    def __init__(self, capacity: int):
        self.capacity = capacity
        self.data_pointer = 0
        self.n_entries = 0
        self.tree = np.zeros(2 * capacity - 1)  # internal nodes hold sums, leaves hold priorities
        self.data = np.zeros(capacity, dtype=object)

    def update(self, tree_idx, p):
        '''Update the sampling weight
        '''
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        while tree_idx != 0:  # propagate the change up to the root
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def add(self, p, data):
        '''Add new data to the SumTree
        '''
        tree_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data
        self.update(tree_idx, p)

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # overwrite the oldest entry when full
            self.data_pointer = 0

        if self.n_entries < self.capacity:
            self.n_entries += 1

    def get_leaf(self, v):
        '''Sample a leaf by prefix sum
        '''
        parent_idx = 0
        while True:
            cl_idx = 2 * parent_idx + 1
            cr_idx = cl_idx + 1
            if cl_idx >= len(self.tree):  # reached a leaf
                leaf_idx = parent_idx
                break
            else:
                if v <= self.tree[cl_idx]:
                    parent_idx = cl_idx
                else:
                    v -= self.tree[cl_idx]
                    parent_idx = cr_idx

        data_idx = leaf_idx - self.capacity + 1
        return leaf_idx, self.tree[leaf_idx], self.data[data_idx]

    def total(self):
        return int(self.tree[0])
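A short sketch of how the SumTree above behaves: leaves hold priorities, internal nodes hold their sums, and get_leaf walks down by prefix sum, so a uniform draw over [0, total) lands in a leaf with probability proportional to its priority.

tree = SumTree(capacity=4)
for i, p in enumerate([1.0, 2.0, 3.0, 4.0]):
    tree.add(p, data=f"transition-{i}")        # root sum is now 10
leaf_idx, priority, data = tree.get_leaf(6.5)  # falls in the p=4.0 leaf ("transition-3")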
class ReplayTree:
    '''ReplayTree for the PER (Prioritized Experience Replay) DQN.
    '''
    def __init__(self, capacity):
        self.capacity = capacity  # capacity of the replay memory
        self.tree = SumTree(capacity)

        ## hyperparameters for the importance sampling weight
        self.beta_increment_per_sampling = 0.001
        self.alpha = 0.6
        self.beta = 0.4
        self.epsilon = 0.01
        self.abs_err_upper = 1.

    def __len__(self):
        ''' return the number of stored transitions
        '''
        return self.tree.total()

    def push(self, error, sample):
        '''Push a sample into the replay with a priority derived from its TD error
        '''
        p = (np.abs(error) + self.epsilon) ** self.alpha
        self.tree.add(p, sample)

    def sample(self, batch_size):
        '''Sample a batch of data; the original code is from:
        https://github.com/rlcode/per/blob/master/prioritized_memory.py
        '''
        pri_segment = self.tree.total() / batch_size

        priorities = []
        batch = []
        idxs = []

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])  # anneal beta toward 1

        for i in range(batch_size):
            a = pri_segment * i
            b = pri_segment * (i + 1)

            s = random.uniform(a, b)
            idx, p, data = self.tree.get_leaf(s)

            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        sampling_probabilities = np.array(priorities) / self.tree.total()
        is_weights = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weights /= is_weights.max()  # normalize so the largest weight is 1

        return zip(*batch), idxs, is_weights

    def batch_update(self, tree_idx, abs_errors):
        '''Update the priorities from the new absolute TD errors
        '''
        abs_errors += self.epsilon
        clipped_errors = np.minimum(abs_errors, self.abs_err_upper)
        ps = np.power(clipped_errors, self.alpha)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
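And the round trip per_dqn.py drives through ReplayTree: push with an initial TD-error priority, sample an importance-weighted batch, then refresh the priorities. Sketch only, with placeholder transitions and errors:

import numpy as np

memory = ReplayTree(capacity=1000)
for _ in range(64):
    transition = (np.zeros(4), 0, 1.0, np.zeros(4), False)  # (s, a, r, s', done)
    memory.push(error=np.random.rand(), sample=transition)

(states, actions, rewards, next_states, dones), idxs, is_weights = memory.sample(32)
new_errors = np.random.rand(32)         # stand-in for the new |TD error| per sample
memory.batch_update(idxs, new_errors)   # priorities now follow the new errors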
@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:02:24
LastEditor: John
-LastEditTime: 2022-10-26 07:38:17
+LastEditTime: 2022-11-14 10:27:43
Discription:
Environment:
'''
@@ -179,6 +179,8 @@ def all_seed(env,seed = 1):
    import torch
    import numpy as np
    import random
+    if seed == 0:
+        return
    # print(f"seed = {seed}")
    env.seed(seed)  # env config
    np.random.seed(seed)