hot update

johnjim0816
2022-08-22 17:50:11 +08:00
parent 0a54840828
commit ad65dd17cd
54 changed files with 1639 additions and 503 deletions

Binary file not shown (image; before: 28 KiB).


@@ -0,0 +1,16 @@
{
"algo_name": "PolicyGradient",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.99,
"lr": 0.005,
"update_fre": 8,
"hidden_dim": 36,
"device": "cpu",
"seed": 1,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/models/",
"save_fig": true,
"show_fig": false
}
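
The JSON above is the run configuration that the training script later in this commit writes via save_args from common.utils, which is not part of this diff. A minimal sketch of what such a helper might look like (the body and the params.json file name are assumptions, not the repository's actual code):

import json
import os

def save_args(args, path):
    # Hypothetical sketch: dump the argparse namespace to JSON so the run
    # can be reproduced later; the real common.utils.save_args is not shown here.
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, 'params.json'), 'w') as f:
        json.dump(vars(args), f, indent=4)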

Binary file not shown (image; after: 35 KiB).


@@ -0,0 +1,21 @@
episodes,rewards
0,200.0
1,200.0
2,165.0
3,200.0
4,200.0
5,200.0
6,200.0
7,200.0
8,200.0
9,200.0
10,200.0
11,168.0
12,200.0
13,200.0
14,200.0
15,115.0
16,198.0
17,200.0
18,200.0
19,200.0
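
This rewards table is produced by save_results from common.utils, which this diff also does not include. A rough sketch, assuming it receives the {'episodes': ..., 'rewards': ...} dictionary returned by train()/test() below and writes one CSV per tag (the pandas-based body and the file name are assumptions):

import os
import pandas as pd

def save_results(res_dic, tag='train', path='./results/'):
    # Hypothetical sketch: persist per-episode rewards as a CSV like the one above.
    os.makedirs(path, exist_ok=True)
    pd.DataFrame(res_dic).to_csv(os.path.join(path, f'{tag}ing_results.csv'), index=False)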

Binary file not shown (image; after: 66 KiB).


@@ -0,0 +1,201 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,38.0
9,40.0
10,23.0
11,14.0
12,16.0
13,25.0
14,15.0
15,23.0
16,11.0
17,28.0
18,21.0
19,62.0
20,33.0
21,27.0
22,15.0
23,17.0
24,26.0
25,35.0
26,26.0
27,14.0
28,42.0
29,45.0
30,34.0
31,39.0
32,31.0
33,17.0
34,42.0
35,41.0
36,31.0
37,39.0
38,28.0
39,12.0
40,36.0
41,33.0
42,47.0
43,40.0
44,63.0
45,36.0
46,64.0
47,79.0
48,49.0
49,40.0
50,65.0
51,47.0
52,51.0
53,30.0
54,26.0
55,41.0
56,86.0
57,61.0
58,38.0
59,200.0
60,49.0
61,70.0
62,61.0
63,101.0
64,200.0
65,152.0
66,108.0
67,46.0
68,72.0
69,87.0
70,27.0
71,126.0
72,46.0
73,25.0
74,14.0
75,42.0
76,38.0
77,55.0
78,42.0
79,51.0
80,67.0
81,83.0
82,178.0
83,115.0
84,140.0
85,97.0
86,85.0
87,61.0
88,153.0
89,200.0
90,200.0
91,200.0
92,200.0
93,64.0
94,200.0
95,200.0
96,157.0
97,128.0
98,160.0
99,35.0
100,140.0
101,113.0
102,200.0
103,154.0
104,200.0
105,200.0
106,200.0
107,198.0
108,137.0
109,200.0
110,200.0
111,102.0
112,200.0
113,200.0
114,200.0
115,200.0
116,148.0
117,200.0
118,200.0
119,200.0
120,200.0
121,200.0
122,194.0
123,200.0
124,200.0
125,200.0
126,183.0
127,200.0
128,200.0
129,200.0
130,200.0
131,200.0
132,200.0
133,200.0
134,200.0
135,200.0
136,93.0
137,96.0
138,84.0
139,103.0
140,79.0
141,104.0
142,82.0
143,105.0
144,200.0
145,200.0
146,171.0
147,200.0
148,200.0
149,200.0
150,200.0
151,197.0
152,133.0
153,142.0
154,147.0
155,156.0
156,131.0
157,181.0
158,163.0
159,146.0
160,200.0
161,176.0
162,200.0
163,173.0
164,177.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,190.0
197,200.0
198,189.0
199,200.0
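
These per-episode training rewards feed plot_rewards(rewards, cfg, path=..., tag=...), another common.utils helper that is absent from this commit's hunks. A minimal sketch under that assumed signature, with a simple smoothed curve added for readability (the smoothing rule, title, and file name are assumptions):

import os
import matplotlib.pyplot as plt

def plot_rewards(rewards, cfg, path='./results/', tag='train'):
    # Hypothetical sketch: plot raw and exponentially smoothed rewards,
    # then save and/or display the figure according to cfg flags.
    smoothed = []
    for r in rewards:
        smoothed.append(0.9 * smoothed[-1] + 0.1 * r if smoothed else r)
    plt.figure()
    plt.title(f'{tag}ing curve of {cfg.algo_name} on {cfg.env_name}')
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(smoothed, label='smoothed rewards')
    plt.legend()
    if cfg.save_fig:
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, f'{tag}ing_curve.png'))
    if cfg.show_fig:
        plt.show()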


@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2022-02-10 01:25:27
LastEditTime: 2022-08-22 17:35:34
Description:
Environment:
'''
@@ -16,35 +16,27 @@ from torch.distributions import Bernoulli
from torch.autograd import Variable
import numpy as np
class MLP(nn.Module):
''' Multi-layer perceptron
Input: state dimension
Output: probability
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24 and 36 are the hidden layer sizes; adjust them according to input_dim and n_actions
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
class PolicyGradient:
def __init__(self, n_states,cfg):
def __init__(self, n_states,model,memory,cfg):
self.gamma = cfg.gamma
self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.device = torch.device(cfg.device)
self.memory = memory
self.policy_net = model.to(self.device)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size
def choose_action(self,state):
def sample_action(self,state):
state = torch.from_numpy(state).float()
state = Variable(state)
probs = self.policy_net(state)
m = Bernoulli(probs) # Bernoulli distribution
action = m.sample()
action = action.data.numpy().astype(int)[0] # convert to a scalar
return action
def predict_action(self,state):
state = torch.from_numpy(state).float()
state = Variable(state)
probs = self.policy_net(state)
@@ -53,7 +45,9 @@ class PolicyGradient:
action = action.data.numpy().astype(int)[0] # convert to a scalar
return action
def update(self,reward_pool,state_pool,action_pool):
def update(self):
state_pool,action_pool,reward_pool= self.memory.sample()
state_pool,action_pool,reward_pool = list(state_pool),list(action_pool),list(reward_pool)
# Discount reward
running_add = 0
for i in reversed(range(len(reward_pool))):
@@ -83,7 +77,11 @@ class PolicyGradient:
# print(loss)
loss.backward()
self.optimizer.step()
def save(self,path):
torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pt')
def load(self,path):
self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pt'))
self.memory.clear()
def save_model(self,path):
from pathlib import Path
# create path
Path(path).mkdir(parents=True, exist_ok=True)
torch.save(self.policy_net.state_dict(), path+'checkpoint.pt')
def load_model(self,path):
self.policy_net.load_state_dict(torch.load(path+'checkpoint.pt'))
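
The two update() hunks above are truncated by the diff view: only the start of the discounted-reward loop and the final backward/step are visible. For orientation, here is a standalone sketch of the complete REINFORCE update those fragments implement; the function name and exact details are assumptions, not the file's verbatim contents.

import numpy as np
import torch
from torch.distributions import Bernoulli

def reinforce_update(policy_net, optimizer, state_pool, action_pool, reward_pool, gamma=0.99):
    # Sketch only: turn raw per-step rewards into discounted returns,
    # normalize them, then take a policy-gradient step using the
    # Bernoulli log-probabilities of the taken actions.
    running_add = 0
    for i in reversed(range(len(reward_pool))):
        if reward_pool[i] == 0:  # reward is zeroed at episode end, so 0 marks a boundary
            running_add = 0
        else:
            running_add = running_add * gamma + reward_pool[i]
            reward_pool[i] = running_add
    mean, std = np.mean(reward_pool), np.std(reward_pool)
    reward_pool = [(r - mean) / (std + 1e-8) for r in reward_pool]
    losses = []
    for state, action, ret in zip(state_pool, action_pool, reward_pool):
        probs = policy_net(torch.from_numpy(np.asarray(state, dtype=np.float32)))
        m = Bernoulli(probs)
        # negative log-probability weighted by the return (gradient ascent via minimizing loss)
        losses.append(-m.log_prob(torch.tensor(float(action))) * ret)
    loss = torch.stack(losses).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()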


@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-07-21 21:44:00
LastEditTime: 2022-08-22 17:40:07
Description:
Environment:
'''
@@ -19,10 +19,11 @@ import torch
import datetime
import argparse
from itertools import count
import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay
def get_args():
@@ -32,112 +33,107 @@ def get_args():
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
parser.add_argument('--batch_size',default=8,type=int)
parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
parser.add_argument('--update_fre',default=8,type=int)
parser.add_argument('--hidden_dim',default=36,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=1,type=int,help="seed")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
args = parser.parse_args([])
return args
class PGNet(MLP):
''' instead of outputting an action, PGNet outputs action probabilities; we can simply inherit from MLP here
'''
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
def env_agent_config(cfg,seed=1):
def env_agent_config(cfg):
env = gym.make(cfg.env_name)
env.seed(seed)
if cfg.seed !=0: # set random seed
all_seed(env,seed=cfg.seed)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
n_actions = env.action_space.n # action dimension
print(f"state dim: {n_states}, action dim: {n_actions}")
model = PGNet(n_states,1,hidden_dim=cfg.hidden_dim)
memory = PGReplay()
agent = PolicyGradient(n_states,model,memory,cfg)
return env,agent
def train(cfg,env,agent):
print('Start training!')
print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
state_pool = [] # temp states pool per several episodes
action_pool = []
reward_pool = []
print(f'Env:{cfg.env_name}, Algo:{cfg.algo_name}, Device:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # choose action based on the current state
action = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
agent.memory.push((state,float(action),reward))
state = next_state
if done:
print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}')
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = []
action_pool = []
reward_pool = []
if (i_ep+1) % cfg.update_fre == 0:
agent.update()
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Finish training!')
env.close() # close environment
return rewards, ma_rewards
res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
return res_dic
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}')
print("start testing!")
print(f"Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}")
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # choose action based on the current state
action = agent.predict_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('Episode: {}/{}, Reward: {}'.format(i_ep + 1, cfg.train_eps, ep_reward))
print(f'Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}')
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Finish testing!')
print("finish testing!")
env.close()
return rewards, ma_rewards
return {'episodes':range(len(rewards)),'rewards':rewards}
if __name__ == "__main__":
cfg = Config()
# training
cfg = get_args()
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # create folders for saving results and models
agent.save(path=cfg.model_path) # save model
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # save results
plot_rewards(rewards, ma_rewards, cfg, tag="train") # plot results
# testing
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # load model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # save results
plot_rewards(rewards, ma_rewards, cfg, tag="test") # plot results
res_dic = train(cfg, env, agent)
save_args(cfg,path = cfg.result_path) # save parameters
agent.save_model(path = cfg.model_path) # save models
save_results(res_dic, tag = 'train', path = cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train") # plot results
# testing
env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path = cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test")
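
Two of the imports used above, PGReplay from common.memories and all_seed from common.utils, are not included in this commit's visible hunks. Minimal sketches consistent with how they are called here (push/sample/clear on the buffer, and seeding the env plus the usual RNGs); both bodies are assumptions rather than the repository's actual code:

import random
import numpy as np
import torch

def all_seed(env, seed=1):
    # Hypothetical sketch: seed the environment and every RNG in play.
    env.seed(seed)
    env.action_space.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

class PGReplay:
    ''' Hypothetical sketch of the on-policy buffer used by PolicyGradient:
    push() stores (state, action, reward) tuples, sample() returns them
    unpacked into parallel sequences, clear() empties the buffer.
    '''
    def __init__(self):
        self.buffer = []
    def push(self, transition):
        self.buffer.append(transition)
    def sample(self):
        return zip(*self.buffer)
    def clear(self):
        self.buffer = []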