update
@@ -1,38 +1,15 @@
# Policy Gradient

Implements REINFORCE, the most basic Policy Gradient method.

## Usage

Simply run `main.py`.

## How it works

See my blog post [Policy Gradient算法实战](https://blog.csdn.net/JohnJim0/article/details/110236851)
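
As a quick reminder of the method the post covers: REINFORCE is Monte-Carlo policy gradient ascent, weighting each log-probability by the discounted return observed from that step onward. A standard form of the estimator (textbook notation, not taken from this repository) is:

```latex
% REINFORCE (Monte-Carlo policy gradient) estimator over N sampled episodes
\nabla_\theta J(\theta) \approx \frac{1}{N}\sum_{i=1}^{N}\sum_{t=0}^{T_i-1}
G_{i,t}\,\nabla_\theta \log \pi_\theta(a_{i,t}\mid s_{i,t}),
\qquad
G_{i,t}=\sum_{k=t}^{T_i-1}\gamma^{\,k-t}\, r_{i,k}
```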

## Environment

python 3.7.9

pytorch 1.6.0

tensorboard 2.3.0

torchvision 0.7.0

python 3.7.9, pytorch 1.6.0

## How to run

train:

```bash
python main.py
```

eval:

```bash
python main.py --train 0
```

tensorboard:

```bash
tensorboard --logdir logs
```

## References

[REINFORCE和Reparameterization Trick](https://blog.csdn.net/JohnJim0/article/details/110230703)
@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2020-11-23 17:04:37
LastEditTime: 2021-03-13 11:50:16
Discription:
Environment:
'''
@@ -14,24 +14,23 @@ from torch.distributions import Bernoulli
from torch.autograd import Variable
import numpy as np

from model import FCN
from common.model import MLP1

class PolicyGradient:

    def __init__(self, state_dim,device='cpu',gamma = 0.99,lr = 0.01,batch_size=5):
        self.gamma = gamma
        self.policy_net = FCN(state_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=lr)
        self.batch_size = batch_size
    def __init__(self, n_states,cfg):
        self.gamma = cfg.gamma
        self.policy_net = MLP1(n_states,hidden_dim=cfg.hidden_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
        self.batch_size = cfg.batch_size

    def choose_action(self,state):

        state = torch.from_numpy(state).float()
        state = Variable(state)
        probs = self.policy_net(state)
        m = Bernoulli(probs)
        m = Bernoulli(probs) # Bernoulli distribution
        action = m.sample()

        action = action.data.numpy().astype(int)[0] # convert to a scalar
        return action
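
Note that `Variable` has been a no-op since PyTorch 0.4, so on the PyTorch 1.6.0 listed in the README the wrapper can simply be dropped. A minimal standalone sketch of the same Bernoulli sampling, with the policy network passed in explicitly for illustration:

```python
import torch
from torch.distributions import Bernoulli

def choose_action(policy_net, state):
    '''Sample a CartPole action (0 or 1) from a network that outputs P(action = 1).'''
    state = torch.from_numpy(state).float()
    prob = policy_net(state)            # shape (1,): probability of action 1
    action = Bernoulli(prob).sample()   # tensor([0.]) or tensor([1.])
    return int(action.item())           # plain int for env.step()
```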
@@ -67,6 +66,6 @@ class PolicyGradient:
        loss.backward()
        self.optimizer.step()
    def save_model(self,path):
        torch.save(self.policy_net.state_dict(), path)
        torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pth')
    def load_model(self,path):
        self.policy_net.load_state_dict(torch.load(path))
        self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pth'))
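
The hunk above shows only the tail of the policy update. For context, here is a hedged sketch of a typical REINFORCE `update` consistent with the Bernoulli policy, `self.gamma`, and the transition pools used in `main.py`; the argument order and the zero-reward episode marker are assumptions, not taken from this diff:

```python
import numpy as np
import torch
from torch.distributions import Bernoulli

def update(self, reward_pool, state_pool, action_pool):
    # Discounted returns, computed backwards; a reward of 0 is assumed to mark an episode end.
    running_add = 0
    for i in reversed(range(len(reward_pool))):
        if reward_pool[i] == 0:
            running_add = 0
        else:
            running_add = running_add * self.gamma + reward_pool[i]
            reward_pool[i] = running_add

    # Normalize returns to reduce the variance of the gradient estimate.
    reward_mean = np.mean(reward_pool)
    reward_std = np.std(reward_pool)
    for i in range(len(reward_pool)):
        reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std

    self.optimizer.zero_grad()
    for i in range(len(reward_pool)):
        state = torch.from_numpy(state_pool[i]).float()
        action = torch.FloatTensor([action_pool[i]])
        reward = reward_pool[i]
        probs = self.policy_net(state)
        m = Bernoulli(probs)
        loss = -m.log_prob(action) * reward  # gradient ascent on return-weighted log-prob
        loss.backward()                      # accumulate gradients over the whole pool
    self.optimizer.step()
```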
@@ -1,19 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:23:10
LastEditor: John
LastEditTime: 2020-11-23 11:55:24
Discription:
Environment:
'''
import gym

def env_init():
    env = gym.make('CartPole-v0') # some examples use env.unwrapped; it is generally not needed here
    env.seed(1) # set the env random seed
    state_dim = env.observation_space.shape[0]
    n_actions = env.action_space.n
    return env,state_dim,n_actions
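
This helper is deleted by the commit; the new `main.py` inlines the same setup. For reference, it was used roughly like this (a minimal sketch):

```python
from env import env_init  # the helper deleted above

env, state_dim, n_actions = env_init()  # CartPole-v0: state_dim == 4, n_actions == 2
state = env.reset()                     # initial observation for the agent
```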
@@ -5,34 +5,47 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2020-11-24 19:52:40
LastEditTime: 2021-03-13 11:50:32
Discription:
Environment:
'''
import sys,os
sys.path.append(os.getcwd()) # add the current working directory to the path
from itertools import count
import torch
import os
from torch.utils.tensorboard import SummaryWriter
import datetime
import gym
from PolicyGradient.agent import PolicyGradient
from common.plot import plot_rewards
from common.utils import save_results

from env import env_init
from params import get_args
from agent import PolicyGradient
from params import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
from utils import save_results,save_model
from plot import plot
def train(cfg):
    env,state_dim,n_actions = env_init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # detect GPU
    agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr)
    SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # current timestamp
    SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path for saved models
    if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # create the folder if it does not exist
        os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
    if not os.path.exists(SAVED_MODEL_PATH): # create the folder if it does not exist
        os.mkdir(SAVED_MODEL_PATH)
    RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path for saving rewards
    if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # create the folder if it does not exist
        os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
    if not os.path.exists(RESULT_PATH): # create the folder if it does not exist
        os.mkdir(RESULT_PATH)

class PGConfig:
    def __init__(self):
        self.train_eps = 300 # number of training episodes
        self.batch_size = 8
        self.lr = 0.01 # learning rate
        self.gamma = 0.99
        self.hidden_dim = 36 # hidden layer dimension

def train(cfg,env,agent):
    '''the *_pool lists below store transition sequences used for the gradient update'''
    state_pool = [] # stores the states of every batch_size episodes
    action_pool = []
    reward_pool = []
    ''' store per-episode rewards for plotting'''
    rewards = []
    moving_average_rewards = []
    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir) # tensorboard writer
    ma_rewards = []
    for i_episode in range(cfg.train_eps):
        state = env.reset()
        ep_reward = 0
@@ -55,55 +68,22 @@ def train(cfg):
            action_pool = []
            reward_pool = []
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode+1)
    writer.close()
    print('Complete training!')
    save_model(agent,model_path=SAVED_MODEL_PATH)
    '''save rewards and related results'''
    save_results(rewards,moving_average_rewards,tag='train',result_path=RESULT_PATH)
    plot(rewards)
    plot(moving_average_rewards,ylabel='moving_average_rewards_train')

def eval(cfg,saved_model_path = SAVED_MODEL_PATH):
    env,state_dim,n_actions = env_init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # detect GPU
    agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr)
    agent.load_model(saved_model_path+'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir) # tensorboard writer
    for i_episode in range(cfg.eval_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
            action = agent.choose_action(state) # choose an action from the current state
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            state = next_state
            if done:
                print('Episode:', i_episode, ' Reward:', ep_reward)
                break
        rewards.append(ep_reward)
        if i_episode == 0:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode+1)
    writer.close()
    print('Complete evaling!')

            ma_rewards.append(ep_reward)
    print('complete training!')
    return rewards, ma_rewards
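
The two hunks of `train` shown above elide the inner episode loop (action selection, environment step, and transition pooling). A hedged sketch of what that elided part typically looks like in this kind of REINFORCE loop; the function name and the zero terminal reward are assumptions, not taken from the diff:

```python
from itertools import count

def run_episode(env, agent, state_pool, action_pool, reward_pool):
    '''Play one episode and append its transitions to the pools; returns the episode reward.'''
    state = env.reset()
    ep_reward = 0
    for _ in count():
        action = agent.choose_action(state)            # sample 0/1 from the Bernoulli policy
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward
        if done:
            reward = 0                                  # assumed marker for the episode boundary
        state_pool.append(state)
        action_pool.append(float(action))
        reward_pool.append(reward)
        state = next_state
        if done:
            break
    return ep_reward
```

In the actual loop this is followed, every `cfg.batch_size` episodes, by the agent's policy update on the pooled transitions and by clearing the pools, as the second hunk shows.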

if __name__ == "__main__":
    cfg = get_args()
    if cfg.train:
        train(cfg)
        eval(cfg)
    else:
        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
        eval(cfg,saved_model_path=model_path)
    cfg = PGConfig()
    env = gym.make('CartPole-v0') # some examples use env.unwrapped; it is generally not needed here
    env.seed(1) # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = PolicyGradient(n_states,cfg)
    rewards, ma_rewards = train(cfg,env,agent)
    agent.save_model(SAVED_MODEL_PATH)
    save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
    plot_rewards(rewards,ma_rewards,tag="train",algo = "Policy Gradient",path=RESULT_PATH)
@@ -1,27 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:18:46
LastEditor: John
LastEditTime: 2020-11-27 16:55:25
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
    ''' fully connected network'''
    def __init__(self,state_dim):
        super(FCN, self).__init__()
        # 24 and 36 are hidden-layer sizes; adjust them according to state_dim and n_actions
        self.fc1 = nn.Linear(state_dim, 36)
        self.fc2 = nn.Linear(36, 36)
        self.fc3 = nn.Linear(36, 1) # Prob of Left

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.sigmoid(self.fc3(x))
        return x
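
This FCN is deleted and the agent now builds its network as `MLP1(n_states, hidden_dim=cfg.hidden_dim)` from `common.model`, which is not included in the diff. A plausible minimal sketch, assuming MLP1 mirrors FCN with a configurable hidden size and a sigmoid output feeding the Bernoulli policy:

```python
import torch
import torch.nn as nn

class MLP1(nn.Module):
    '''Assumed shape of common.model.MLP1 (not shown in this diff).'''
    def __init__(self, n_states, hidden_dim=36):
        super(MLP1, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)  # probability of action 1 (Bernoulli policy)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))  # torch.sigmoid avoids the deprecated F.sigmoid
```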
@@ -1,29 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:25:37
LastEditor: John
LastEditTime: 2020-11-26 19:11:21
Discription: stores the parameters
Environment:
'''
import argparse
import datetime
import os

SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'

def get_args():
    '''training parameters'''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default=1, type=int) # 1 means train, 0 means eval only
    parser.add_argument("--train_eps", default=300, type=int) # maximum number of training episodes
    parser.add_argument("--eval_eps", default=100, type=int) # maximum number of evaluation episodes
    parser.add_argument("--batch_size", default=4, type=int) # number of episodes per gradient update
    parser.add_argument("--policy_lr", default=0.01, type=float) # learning rate
    config = parser.parse_args()
    return config
@@ -1,46 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-23 13:48:46
LastEditor: John
LastEditTime: 2020-11-23 13:48:48
Discription:
Environment:
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

def plot(item,ylabel='rewards_train', save_fig = True):
    '''plot results using seaborn
    '''
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DQN')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
    plt.show()

if __name__ == "__main__":

    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
    tag = 'train'
    rewards=np.load(output_path+"rewards_"+tag+".npy", )
    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
    steps=np.load(output_path+"steps_"+tag+".npy")
    plot(rewards)
    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
    plot(steps,ylabel='steps_'+tag)
    tag = 'eval'
    rewards=np.load(output_path+"rewards_"+tag+".npy", )
    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
    steps=np.load(output_path+"steps_"+tag+".npy")
    plot(rewards,ylabel='rewards_'+tag)
    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
    plot(steps,ylabel='steps_'+tag)
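
This local plot.py is deleted; the new main.py instead calls `plot_rewards(rewards, ma_rewards, tag="train", algo="Policy Gradient", path=RESULT_PATH)` from `common.plot`, which is outside this diff. A hedged sketch of what that helper plausibly looks like, inferred only from the call site above:

```python
import os
import matplotlib.pyplot as plt
import seaborn as sns

def plot_rewards(rewards, ma_rewards, tag="train", algo="Policy Gradient", path='./results/'):
    '''Assumed shape of common.plot.plot_rewards (not shown in this diff).'''
    sns.set()
    plt.figure()
    plt.title("rewards of {} ({})".format(algo, tag))
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='moving average rewards')
    plt.legend()
    plt.savefig(os.path.join(path, "rewards_curve_{}.png".format(tag)))  # save before show
    plt.show()
```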
BIN codes/PolicyGradient/results/20210313-114904/rewards_train.npy (new binary file; the commit's other binary files and result images are not shown)
@@ -1,29 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-23 13:44:52
LastEditor: John
LastEditTime: 2020-11-23 13:45:42
Discription:
Environment:
'''
import os
import numpy as np


def save_results(rewards,moving_average_rewards,tag='train',result_path='./result'):
    '''save the rewards and related results
    '''
    if not os.path.exists(result_path): # create the folder if it does not exist
        os.mkdir(result_path)
    np.save(result_path+'rewards_'+tag+'.npy', rewards)
    np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards)
    print('results saved!')

def save_model(agent,model_path='./saved_model'):
    if not os.path.exists(model_path): # create the folder if it does not exist
        os.mkdir(model_path)
    agent.save_model(model_path+'checkpoint.pth')
    print('model saved!')
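
These local helpers are deleted; the new main.py imports `save_results` from `common.utils` and calls it with a `path` keyword instead of `result_path`. A hedged sketch of that replacement, matching only the new call site (assumed, not part of this diff):

```python
import os
import numpy as np

def save_results(rewards, ma_rewards, tag='train', path='./results/'):
    '''Assumed shape of common.utils.save_results (not shown in this diff).'''
    if not os.path.exists(path):
        os.makedirs(path)  # create the folder (and any parents) if missing
    np.save(path + 'rewards_' + tag + '.npy', rewards)
    np.save(path + 'ma_rewards_' + tag + '.npy', ma_rewards)
    print('results saved!')
```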