hot update
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 28 KiB
Binary file not shown.
Binary file not shown.
Binary file not shown.
Before Width: | Height: | Size: 55 KiB
Binary file not shown.
@@ -0,0 +1,16 @@
{
    "algo_name": "PolicyGradient",
    "env_name": "CartPole-v0",
    "train_eps": 200,
    "test_eps": 20,
    "gamma": 0.99,
    "lr": 0.005,
    "update_fre": 8,
    "hidden_dim": 36,
    "device": "cpu",
    "seed": 1,
    "result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/results/",
    "model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220822-174059/models/",
    "save_fig": true,
    "show_fig": false
}
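This JSON is the hyperparameter snapshot that save_args writes alongside the results (see the updated main script further down). Purely as an illustration, and assuming the file is named params.json (the actual file name is not shown in this commit), it could be loaded back into a namespace like so:

import json
from argparse import Namespace

def load_saved_args(result_path):
    # Hypothetical loader for the saved hyperparameters; 'params.json' is an assumed name.
    with open(result_path + 'params.json', 'r', encoding='utf-8') as f:
        return Namespace(**json.load(f))

# cfg = load_saved_args(".../outputs/CartPole-v0/20220822-174059/results/")
# cfg.lr -> 0.005, cfg.update_fre -> 8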
Binary file not shown.
After Width: | Height: | Size: 35 KiB
@@ -0,0 +1,21 @@
episodes,rewards
0,200.0
1,200.0
2,165.0
3,200.0
4,200.0
5,200.0
6,200.0
7,200.0
8,200.0
9,200.0
10,200.0
11,168.0
12,200.0
13,200.0
14,200.0
15,115.0
16,198.0
17,200.0
18,200.0
19,200.0
Binary file not shown.
After Width: | Height: | Size: 66 KiB
@@ -0,0 +1,201 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,38.0
9,40.0
10,23.0
11,14.0
12,16.0
13,25.0
14,15.0
15,23.0
16,11.0
17,28.0
18,21.0
19,62.0
20,33.0
21,27.0
22,15.0
23,17.0
24,26.0
25,35.0
26,26.0
27,14.0
28,42.0
29,45.0
30,34.0
31,39.0
32,31.0
33,17.0
34,42.0
35,41.0
36,31.0
37,39.0
38,28.0
39,12.0
40,36.0
41,33.0
42,47.0
43,40.0
44,63.0
45,36.0
46,64.0
47,79.0
48,49.0
49,40.0
50,65.0
51,47.0
52,51.0
53,30.0
54,26.0
55,41.0
56,86.0
57,61.0
58,38.0
59,200.0
60,49.0
61,70.0
62,61.0
63,101.0
64,200.0
65,152.0
66,108.0
67,46.0
68,72.0
69,87.0
70,27.0
71,126.0
72,46.0
73,25.0
74,14.0
75,42.0
76,38.0
77,55.0
78,42.0
79,51.0
80,67.0
81,83.0
82,178.0
83,115.0
84,140.0
85,97.0
86,85.0
87,61.0
88,153.0
89,200.0
90,200.0
91,200.0
92,200.0
93,64.0
94,200.0
95,200.0
96,157.0
97,128.0
98,160.0
99,35.0
100,140.0
101,113.0
102,200.0
103,154.0
104,200.0
105,200.0
106,200.0
107,198.0
108,137.0
109,200.0
110,200.0
111,102.0
112,200.0
113,200.0
114,200.0
115,200.0
116,148.0
117,200.0
118,200.0
119,200.0
120,200.0
121,200.0
122,194.0
123,200.0
124,200.0
125,200.0
126,183.0
127,200.0
128,200.0
129,200.0
130,200.0
131,200.0
132,200.0
133,200.0
134,200.0
135,200.0
136,93.0
137,96.0
138,84.0
139,103.0
140,79.0
141,104.0
142,82.0
143,105.0
144,200.0
145,200.0
146,171.0
147,200.0
148,200.0
149,200.0
150,200.0
151,197.0
152,133.0
153,142.0
154,147.0
155,156.0
156,131.0
157,181.0
158,163.0
159,146.0
160,200.0
161,176.0
162,200.0
163,173.0
164,177.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,190.0
197,200.0
198,189.0
199,200.0
@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2022-02-10 01:25:27
LastEditTime: 2022-08-22 17:35:34
Description:
Environment:
'''
@@ -16,35 +16,27 @@ from torch.distributions import Bernoulli
from torch.autograd import Variable
import numpy as np

class MLP(nn.Module):

    ''' Multi-layer perceptron
        Input: state dimension
        Output: probability
    '''
    def __init__(self,input_dim,hidden_dim = 36):
        super(MLP, self).__init__()
        # 24 and 36 are the hidden layer sizes; they can be changed according to input_dim and n_actions
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        return x

class PolicyGradient:

    def __init__(self, n_states,cfg):
    def __init__(self, n_states,model,memory,cfg):
        self.gamma = cfg.gamma
        self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
        self.device = torch.device(cfg.device)
        self.memory = memory
        self.policy_net = model.to(self.device)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
        self.batch_size = cfg.batch_size

    def choose_action(self,state):

    def sample_action(self,state):

        state = torch.from_numpy(state).float()
        state = Variable(state)
        probs = self.policy_net(state)
        m = Bernoulli(probs) # Bernoulli distribution
        action = m.sample()
        action = action.data.numpy().astype(int)[0] # convert to scalar
        return action
    def predict_action(self,state):

        state = torch.from_numpy(state).float()
        state = Variable(state)
        probs = self.policy_net(state)
@@ -53,7 +45,9 @@ class PolicyGradient:
        action = action.data.numpy().astype(int)[0] # convert to scalar
        return action

    def update(self,reward_pool,state_pool,action_pool):
    def update(self):
        state_pool,action_pool,reward_pool= self.memory.sample()
        state_pool,action_pool,reward_pool = list(state_pool),list(action_pool),list(reward_pool)
        # Discount reward
        running_add = 0
        for i in reversed(range(len(reward_pool))):
@@ -83,7 +77,11 @@ class PolicyGradient:
            # print(loss)
            loss.backward()
        self.optimizer.step()
    def save(self,path):
        torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pt')
    def load(self,path):
        self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pt'))
        self.memory.clear()
    def save_model(self,path):
        from pathlib import Path
        # create path
        Path(path).mkdir(parents=True, exist_ok=True)
        torch.save(self.policy_net.state_dict(), path+'checkpoint.pt')
    def load_model(self,path):
        self.policy_net.load_state_dict(torch.load(path+'checkpoint.pt'))
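The hunk headers above skip the middle of update(), where the discounted returns are normalized and the policy-gradient loss is accumulated. As a reference only, here is a minimal self-contained sketch consistent with the lines that are shown (Bernoulli policy, reward forced to 0 at episode end, discounted returns, one RMSprop step, memory cleared after loading). PGReplay below is a hypothetical stand-in for common.memories.PGReplay, which is not part of this diff.

import torch
from torch.distributions import Bernoulli

class PGReplay:
    ''' Hypothetical stand-in for common.memories.PGReplay: stores (state, action, reward)
        tuples and hands them back column-wise, matching memory.sample() in pg.py. '''
    def __init__(self):
        self.buffer = []
    def push(self, transition):
        self.buffer.append(transition)
    def sample(self):
        return zip(*self.buffer)  # -> (states, actions, rewards)
    def clear(self):
        self.buffer = []

def reinforce_update(policy_net, optimizer, memory, gamma=0.99):
    ''' Sketch of the elided middle of PolicyGradient.update(). '''
    state_pool, action_pool, reward_pool = memory.sample()
    state_pool, action_pool, reward_pool = list(state_pool), list(action_pool), list(reward_pool)
    # Discount rewards; reward == 0 marks an episode boundary (the train loop sets reward = 0 at done)
    running_add = 0
    for i in reversed(range(len(reward_pool))):
        if reward_pool[i] == 0:
            running_add = 0
        else:
            running_add = running_add * gamma + reward_pool[i]
            reward_pool[i] = running_add
    # Normalize returns to reduce gradient variance
    returns = torch.tensor(reward_pool, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # Accumulate -log pi(a|s) * G_t over all stored steps, then take one optimizer step
    optimizer.zero_grad()
    loss = torch.tensor(0.0)
    for state, action, ret in zip(state_pool, action_pool, returns):
        probs = policy_net(torch.as_tensor(state, dtype=torch.float32))
        m = Bernoulli(probs)
        loss = loss - (m.log_prob(torch.tensor(float(action))) * ret).sum()
    loss.backward()
    optimizer.step()
    memory.clear()

In pg.py the same logic runs inside update(), which the training loop below calls every cfg.update_fre episodes.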
@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-07-21 21:44:00
LastEditTime: 2022-08-22 17:40:07
Description:
Environment:
'''
@@ -19,10 +19,11 @@ import torch
import datetime
import argparse
from itertools import count

import torch.nn.functional as F
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from common.utils import save_results, make_dir,all_seed,save_args,plot_rewards
from common.models import MLP
from common.memories import PGReplay


def get_args():
@@ -32,112 +33,107 @@
    parser = argparse.ArgumentParser(description="hyperparameters")
    parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
    parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
    parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
    parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
    parser.add_argument('--batch_size',default=8,type=int)
    parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
    parser.add_argument('--update_fre',default=8,type=int)
    parser.add_argument('--hidden_dim',default=36,type=int)
    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
    parser.add_argument('--seed',default=1,type=int,help="seed")
    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
            '/' + curr_time + '/results/' )
    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
            '/' + curr_time + '/models/' ) # path to save models
    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
    args = parser.parse_args()
    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
    parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
    args = parser.parse_args([])
    return args

class PGNet(MLP):
    ''' Instead of outputting an action, the PG net outputs action probabilities, so we can inherit from MLP here
    '''
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        return x

def env_agent_config(cfg,seed=1):
def env_agent_config(cfg):
    env = gym.make(cfg.env_name)
    env.seed(seed)
    if cfg.seed !=0: # set random seed
        all_seed(env,seed=cfg.seed)
    n_states = env.observation_space.shape[0]
    agent = PolicyGradient(n_states,cfg)
    n_actions = env.action_space.n # action dimension
    print(f"state dim: {n_states}, action dim: {n_actions}")
    model = PGNet(n_states,1,hidden_dim=cfg.hidden_dim)
    memory = PGReplay()
    agent = PolicyGradient(n_states,model,memory,cfg)
    return env,agent

def train(cfg,env,agent):
    print('Start training!')
    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
    state_pool = [] # temp states pool per several episodes
    action_pool = []
    reward_pool = []
    print(f'Env:{cfg.env_name}, Algo:{cfg.algo_name}, Device:{cfg.device}')
    rewards = []
    ma_rewards = []
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state) # select an action based on the current state
            action = agent.sample_action(state) # sample action
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state_pool.append(state)
            action_pool.append(float(action))
            reward_pool.append(reward)
            agent.memory.push((state,float(action),reward))
            state = next_state
            if done:
                print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
                break
        if i_ep > 0 and i_ep % cfg.batch_size == 0:
            agent.update(reward_pool,state_pool,action_pool)
            state_pool = []
            action_pool = []
            reward_pool = []
        if (i_ep+1) % cfg.update_fre == 0:
            agent.update()
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finish training!')
    env.close() # close environment
    return rewards, ma_rewards
    res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
    return res_dic


def test(cfg,env,agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}')
    print("start testing!")
    print(f"Env: {cfg.env_name}, Algo: {cfg.algo_name}, Device: {cfg.device}")
    rewards = []
    ma_rewards = []
    for i_ep in range(cfg.test_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state) # select an action based on the current state
            action = agent.predict_action(state)
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            if done:
                reward = 0
            state = next_state
            if done:
                print('Episode: {}/{}, Reward: {}'.format(i_ep + 1, cfg.train_eps, ep_reward))
                print(f'Episode: {i_ep+1}/{cfg.test_eps},Reward: {ep_reward:.2f}')
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
    print('Finish testing!')
    print("finish testing!")
    env.close()
    return rewards, ma_rewards
    return {'episodes':range(len(rewards)),'rewards':rewards}

if __name__ == "__main__":
    cfg = Config()
    # training
    cfg = get_args()
    env, agent = env_agent_config(cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path) # create folders for saving results and models
    agent.save(path=cfg.model_path) # save model
    save_results(rewards, ma_rewards, tag='train',
                 path=cfg.result_path) # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="train") # plot results
    # testing
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path) # load model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test',
                 path=cfg.result_path) # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="test") # plot results
    res_dic = train(cfg, env, agent)
    save_args(cfg,path = cfg.result_path) # save parameters
    agent.save_model(path = cfg.model_path) # save models
    save_results(res_dic, tag = 'train', path = cfg.result_path) # save results
    plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "train") # plot results
    # testing
    env, agent = env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
    agent.load_model(path = cfg.model_path) # load model
    res_dic = test(cfg, env, agent)
    save_results(res_dic, tag='test',
                 path = cfg.result_path)
    plot_rewards(res_dic['rewards'], cfg, path = cfg.result_path,tag = "test")
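PGNet subclasses MLP from common.models and is constructed as PGNet(n_states, 1, hidden_dim=cfg.hidden_dim), but common.models is not part of this commit. A minimal base class compatible with that usage, assumed to mirror the three-layer MLP that this commit removes from pg.py (an illustrative sketch, not the repository's actual file), might look like this:

import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    ''' Assumed base class for PGNet: three fully connected layers named fc1/fc2/fc3,
        mirroring the MLP that this commit removes from pg.py. '''
    def __init__(self, input_dim, output_dim, hidden_dim=36):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # PGNet overrides forward to end with a sigmoid so the output is a probability.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)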