This commit is contained in:
JohnJim0816
2021-03-23 16:10:11 +08:00
parent d4690c2058
commit bf0f2990cf
198 changed files with 1668 additions and 1545 deletions

View File

@@ -1,3 +0,0 @@
{
"python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
}

View File

@@ -5,19 +5,18 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-03 20:47:09
LastEditor: John
LastEditTime: 2020-11-08 22:16:29
LastEditTime: 2021-03-20 17:41:21
Discription:
Environment:
'''
from model import ActorCritic
from A2C.model import ActorCritic
import torch.optim as optim
class A2C:
def __init__(self,n_states, n_actions, hidden_dim=256,device="cpu",lr = 3e-4):
self.device = device
def __init__(self,n_states, n_actions, cfg):
self.gamma = 0.99
self.model = ActorCritic(n_states, n_actions, hidden_dim=hidden_dim).to(device)
self.optimizer = optim.Adam(self.model.parameters(),lr=lr)
self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device)
self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr)
def choose_action(self, state):
dist, value = self.model(state)
action = dist.sample()
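The training loop in main.py later calls `agent.compute_returns(next_value, rewards, masks)`, which is not visible in this hunk. For reference, a discounted-return helper matching that call would typically look like the sketch below (an illustrative assumption, not this commit's exact code):

```python
# Hedged sketch of a compute_returns helper matching the call in main.py;
# rewards and masks are per-step lists, next_value is the critic's bootstrap value.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # masks[step] is 0 when the episode ended, which stops bootstrapping
        R = rewards[step] + gamma * masks[step] * R
        returns.insert(0, R)
    return returns
```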

View File

@@ -5,13 +5,13 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-30 15:39:37
LastEditor: John
LastEditTime: 2020-11-03 20:52:07
LastEditTime: 2021-03-17 20:19:14
Discription:
Environment:
'''
import gym
from common.multiprocessing_env import SubprocVecEnv
from A2C.multiprocessing_env import SubprocVecEnv
# num_envs = 16
# env_name = "Pendulum-v0"

View File

@@ -5,94 +5,73 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2020-11-08 22:19:56
LastEditTime: 2021-03-20 16:58:04
@Discription:
@Environment: python 3.7.9
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path
import torch
import gym
import os
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
from agent import A2C
from env import make_envs
from utils import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
from utils import save_model,save_results
def get_args():
'''Once the model is built, tune hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 means train, 0 means eval only
parser.add_argument("--gamma", default=0.99,
type=float) # reward discount factor
parser.add_argument("--lr", default=3e-4, type=float) # critic learning rate
parser.add_argument("--actor_lr", default=1e-4, type=float)
parser.add_argument("--memory_capacity", default=10000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,
help="batch size of memory sampling")
parser.add_argument("--train_eps", default=4000, type=int)
parser.add_argument("--train_steps", default=5, type=int)
parser.add_argument("--eval_eps", default=200, type=int) # max number of eval episodes
parser.add_argument("--eval_steps", default=200,
type=int) # max steps per eval episode
parser.add_argument("--target_update", default=4, type=int,
help="how often (in episodes) to update the target net")
config = parser.parse_args()
return config
def test_env(agent,device='cpu'):
env = gym.make("CartPole-v0")
state = env.reset()
ep_reward=0
for _ in range(200):
state = torch.FloatTensor(state).unsqueeze(0).to(device)
dist, value = agent.model(state)
action = dist.sample()
next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
state = next_state
ep_reward += reward
if done:
break
return ep_reward
import datetime
from A2C.agent import A2C
def train(cfg):
print('Start to train ! \n')
envs = make_envs(num_envs=16,env_name="CartPole-v0")
n_states = envs.observation_space.shape[0]
n_actions = envs.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = A2C(n_states, n_actions, hidden_dim=256)
# moving_average_rewards = []
# ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
writer = SummaryWriter(log_dir)
state = envs.reset()
for i_episode in range(1, cfg.train_eps+1):
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get the current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)
class A2CConfig:
def __init__(self):
self.gamma = 0.99
self.lr = 3e-4 # learning rate
self.actor_lr = 1e-4 # learning rate of the actor network
self.memory_capacity = 10000 # capacity of replay memory
self.batch_size = 128
self.train_eps = 200
self.train_steps = 200
self.eval_eps = 200
self.eval_steps = 200
self.target_update = 4
self.hidden_dim=256
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train(cfg,env,agent):
print('Start to train ! ')
for i_episode in range(cfg.train_eps):
state = env.reset()
log_probs = []
values = []
rewards = []
masks = []
entropy = 0
for i_step in range(1, cfg.train_steps+1):
state = torch.FloatTensor(state).to(device)
ep_reward = 0
for i_step in range(cfg.train_steps):
state = torch.FloatTensor(state).to(cfg.device)
dist, value = agent.model(state)
action = dist.sample()
next_state, reward, done, _ = envs.step(action.cpu().numpy())
next_state, reward, done, _ = env.step(action.cpu().numpy())
ep_reward+=reward
state = next_state
log_prob = dist.log_prob(action)
entropy += dist.entropy().mean()
log_probs.append(log_prob)
values.append(value)
rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
if i_episode%20 == 0:
print("reward",test_env(agent,device='cpu'))
next_state = torch.FloatTensor(next_state).to(device)
rewards.append(torch.FloatTensor([reward]).unsqueeze(1).to(cfg.device)) # wrap the scalar reward in a list so FloatTensor receives a sequence
masks.append(torch.FloatTensor([1 - done]).unsqueeze(1).to(cfg.device))
if done:
break
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
next_state = torch.FloatTensor(next_state).to(cfg.device)
_, next_value =agent.model(next_state)
returns = agent.compute_returns(next_value, rewards, masks)
@@ -107,80 +86,17 @@ def train(cfg):
agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()
for _ in range(100):
print("test_reward",test_env(agent,device='cpu'))
# print('Episode:', i_episode, ' Reward: %i' %
# int(ep_reward[0]), 'n_steps:', i_step)
# ep_steps.append(i_step)
# rewards.append(ep_reward)
# if i_episode == 1:
# moving_average_rewards.append(ep_reward[0])
# else:
# moving_average_rewards.append(
# 0.9*moving_average_rewards[-1]+0.1*ep_reward[0])
# writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
# writer.add_scalar('steps_of_each_episode',
# ep_steps[-1], i_episode)
writer.close()
print('Complete training')
''' save the model '''
# save_model(agent,model_path=SAVED_MODEL_PATH)
# '''save rewards and related results'''
# save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH)
# def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
# print('start to eval ! \n')
# env = NormalizedActions(gym.make("Pendulum-v0"))
# n_states = env.observation_space.shape[0]
# n_actions = env.action_space.shape[0]
# agent = DDPG(n_states, n_actions, critic_lr=1e-3,
# actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
# agent.load_model(saved_model_path+'checkpoint.pth')
# rewards = []
# moving_average_rewards = []
# ep_steps = []
# log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
# writer = SummaryWriter(log_dir)
# for i_episode in range(1, cfg.eval_eps+1):
# state = env.reset() # reset the environment state
# ep_reward = 0
# for i_step in range(1, cfg.eval_steps+1):
# action = agent.choose_action(state) # choose an action based on the current state
# next_state, reward, done, _ = env.step(action) # step the environment
# ep_reward += reward
# state = next_state # move to the next state
# if done:
# break
# print('Episode:', i_episode, ' Reward: %i' %
# int(ep_reward), 'n_steps:', i_step, 'done: ', done)
# ep_steps.append(i_step)
# rewards.append(ep_reward)
# # compute the moving-average reward
# if i_episode == 1:
# moving_average_rewards.append(ep_reward)
# else:
# moving_average_rewards.append(
# 0.9*moving_average_rewards[-1]+0.1*ep_reward)
# writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
# writer.add_scalar('steps_of_each_episode',
# ep_steps[-1], i_episode)
# writer.close()
# '''save rewards and related results'''
# if not os.path.exists(RESULT_PATH): # check whether the folder exists
# os.mkdir(RESULT_PATH)
# np.save(RESULT_PATH+'rewards_eval.npy', rewards)
# np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
# np.save(RESULT_PATH+'steps_eval.npy', ep_steps)
if __name__ == "__main__":
cfg = get_args()
train(cfg)
# cfg = get_args()
# if cfg.train:
# train(cfg)
# eval(cfg)
# else:
# model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
# eval(cfg,saved_model_path=model_path)
cfg = A2CConfig()
env = gym.make('CartPole-v0')
env.seed(1) # set random seed for env
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = A2C(n_states, n_actions, cfg)
train(cfg,env,agent)
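The fold marked `@@ -107,80 +86,17 @@` hides how `loss` is assembled from the collected `log_probs`, `values`, the bootstrapped `returns` and the accumulated `entropy` before `agent.optimizer.zero_grad()`. A self-contained sketch of a typical advantage actor-critic loss built from those ingredients (dummy tensors stand in for the rollout data, and the 0.5 and 0.001 coefficients are assumptions, not values taken from this commit):

```python
# Illustrative advantage actor-critic loss; not this commit's exact code.
import torch

log_probs = torch.randn(5, requires_grad=True)  # log pi(a_t|s_t) over 5 steps
values = torch.randn(5, requires_grad=True)     # critic estimates V(s_t)
returns = torch.randn(5)                        # bootstrapped returns from compute_returns
entropy = torch.tensor(0.7)                     # accumulated policy entropy

advantage = returns - values
actor_loss = -(log_probs * advantage.detach()).mean()  # policy gradient term
critic_loss = advantage.pow(2).mean()                  # value regression term
loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
```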

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-03 20:45:25
LastEditor: John
LastEditTime: 2020-11-07 18:49:09
LastEditTime: 2021-03-20 17:41:33
Discription:
Environment:
'''
@@ -13,7 +13,7 @@ import torch.nn as nn
from torch.distributions import Categorical
class ActorCritic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim=256, std=0.0):
def __init__(self, n_states, n_actions, hidden_dim=256):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
@@ -30,6 +30,7 @@ class ActorCritic(nn.Module):
def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value

codes/A2C/test.py (new file, 162 lines)
View File

@@ -0,0 +1,162 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-20 17:43:17
LastEditor: John
LastEditTime: 2021-03-20 19:36:24
Discription:
Environment:
'''
import sys
import torch
import gym
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd
learning_rate = 3e-4
# Constants
GAMMA = 0.99
class A2CConfig:
''' hyperparameters
'''
def __init__(self):
self.gamma = 0.99
self.lr = 3e-4 # learning rate
self.actor_lr = 1e-4 # learning rate of the actor network
self.memory_capacity = 10000 # capacity of replay memory
self.batch_size = 128
self.train_eps = 3000
self.train_steps = 200
self.eval_eps = 200
self.eval_steps = 200
self.target_update = 4
self.hidden_dim=256
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ActorCritic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, learning_rate=3e-4):
super(ActorCritic, self).__init__()
self.n_actions = n_actions
self.critic_linear1 = nn.Linear(n_states, hidden_dim)
self.critic_linear2 = nn.Linear(hidden_dim, 1)
self.actor_linear1 = nn.Linear(n_states, hidden_dim)
self.actor_linear2 = nn.Linear(hidden_dim, n_actions)
def forward(self, state):
state = Variable(torch.from_numpy(state).float().unsqueeze(0))
value = F.relu(self.critic_linear1(state))
value = self.critic_linear2(value)
policy_dist = F.relu(self.actor_linear1(state))
policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1)
return value, policy_dist
class A2C:
def __init__(self,n_states,n_actions,cfg):
self.model = ActorCritic(n_states, n_actions, cfg.hidden_dim)
self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr)
def choose_action(self,state):
pass
def update(self):
pass
def train(cfg,env,agent):
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
actor_critic = ActorCritic(n_states, n_actions, cfg.hidden_dim)
ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate)
all_lengths = []
average_lengths = []
all_rewards = []
entropy_term = 0
for episode in range(cfg.train_eps):
log_probs = []
values = []
rewards = []
state = env.reset()
for steps in range(cfg.train_steps):
value, policy_dist = actor_critic.forward(state)
value = value.detach().numpy()[0,0]
dist = policy_dist.detach().numpy()
action = np.random.choice(n_actions, p=np.squeeze(dist))
log_prob = torch.log(policy_dist.squeeze(0)[action])
entropy = -np.sum(dist * np.log(dist)) # policy entropy: -sum(p * log p)
new_state, reward, done, _ = env.step(action)
rewards.append(reward)
values.append(value)
log_probs.append(log_prob)
entropy_term += entropy
state = new_state
if done or steps == cfg.train_steps-1:
Qval, _ = actor_critic.forward(new_state)
Qval = Qval.detach().numpy()[0,0]
all_rewards.append(np.sum(rewards))
all_lengths.append(steps)
average_lengths.append(np.mean(all_lengths[-10:]))
if episode % 10 == 0:
sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
break
# compute Q values
Qvals = np.zeros_like(values)
for t in reversed(range(len(rewards))):
Qval = rewards[t] + GAMMA * Qval
Qvals[t] = Qval
#update actor critic
values = torch.FloatTensor(values)
Qvals = torch.FloatTensor(Qvals)
log_probs = torch.stack(log_probs)
advantage = Qvals - values
actor_loss = (-log_probs * advantage).mean()
critic_loss = 0.5 * advantage.pow(2).mean()
ac_loss = actor_loss + critic_loss + 0.001 * entropy_term
ac_optimizer.zero_grad()
ac_loss.backward()
ac_optimizer.step()
# Plot results
smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean()
smoothed_rewards = [elem for elem in smoothed_rewards]
plt.plot(all_rewards)
plt.plot(smoothed_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()
plt.plot(all_lengths)
plt.plot(average_lengths)
plt.xlabel('Episode')
plt.ylabel('Episode length')
plt.show()
if __name__ == "__main__":
cfg = A2CConfig()
env = gym.make("CartPole-v0")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = A2C(n_states,n_actions,cfg)
train(cfg,env,agent)
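For intuition about the backward loop that fills `Qvals` above: each entry is an n-step return bootstrapped from the critic's estimate at the final state. A tiny worked example with arbitrary numbers:

```python
# Worked example of the backward Q-value computation used in train() above.
GAMMA = 0.99
rewards = [1.0, 1.0, 1.0]   # rewards collected during one rollout
Qval = 0.5                  # critic's bootstrap value at the final state
Qvals = [0.0] * len(rewards)
for t in reversed(range(len(rewards))):
    Qval = rewards[t] + GAMMA * Qval
    Qvals[t] = Qval
print(Qvals)  # [3.455..., 2.480..., 1.495]
```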

View File

@@ -15,7 +15,7 @@ import datetime
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'
def save_results(rewards,moving_average_rewards,ep_steps,path=RESULT_PATH):

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-03-13 14:56:50
LastEditTime: 2021-03-17 20:35:37
@Discription:
@Environment: python 3.7.7
'''
@@ -68,7 +68,7 @@ def train(cfg,env,agent):
# update the target network: copy all weights and biases from the policy net
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step,done))
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
ep_steps.append(i_step)
rewards.append(ep_reward)
# compute the moving-average reward

View File

@@ -1,33 +0,0 @@
## Approach
See the [blog post](https://blog.csdn.net/JohnJim0/article/details/111552545)
## Environment
python 3.7.9
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0
## Usage
train:
```bash
python main.py
```
eval:
```bash
python main.py --train 0
```
Visualization:
```bash
tensorboard --logdir logs
```

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2020-12-22 16:20:35
LastEditTime: 2021-03-13 15:01:27
@Discription:
@Environment: python 3.7.7
'''
@@ -20,65 +20,51 @@ import torch.nn.functional as F
import random
import math
import numpy as np
from memory import ReplayBuffer
from model import FCN
class DQN:
def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"):
self.actions_count = 0
from common.memory import ReplayBuffer
from common.model import MLP2
class DoubleDQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # total number of actions
self.device = device # device: cpu or gpu
self.gamma = gamma
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma
# parameters of the e-greedy policy
self.epsilon = 0
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.policy_net = FCN(n_states, n_actions).to(self.device)
self.target_net = FCN(n_states, n_actions).to(self.device)
self.actions_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size
self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net starts as an exact copy of policy_net's parameters
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # eval mode: disables BatchNormalization and Dropout
# see the difference between parameters() and state_dict(): the former has requires_grad=True
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
self.memory = ReplayBuffer(cfg.memory_capacity)
def choose_action(self, state, train=True):
def choose_action(self, state):
'''Choose an action
'''
if train:
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
# convert to a tensor first so it can be fed to the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1) returns the maximum value of each row and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum value, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
else:
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
# convert to a tensor first so it can be fed to the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device='cpu', dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.target_net(state)
# tensor.max(1) returns the maximum value of each row and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum value, i.e. the action
action = q_value.max(1)[1].item()
return action
# convert to a tensor first so it can be fed to the network; the state elements are originally float64
# note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
state = torch.tensor(
[state], device=self.device, dtype=torch.float32)
# e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
q_value = self.policy_net(state)
# tensor.max(1) returns the maximum value of each row and its index,
# e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
# so tensor.max(1)[1] returns the index of the maximum value, i.e. the action
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
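For reference, the schedule in choose_action decays epsilon exponentially with the number of actions taken. A quick check of the values it produces, assuming the DoubleDQNConfig defaults shown in main.py below (epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=200):

```python
# Standalone check of the e-greedy epsilon schedule used above.
import math

def epsilon(t, eps_start=0.9, eps_end=0.01, decay=200):
    return eps_end + (eps_start - eps_end) * math.exp(-t / decay)

for t in (0, 200, 600, 1000):
    print(t, round(epsilon(t), 3))  # 0.9, 0.337, 0.054, 0.016
```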
def update(self):
if len(self.memory) < self.batch_size:
@@ -86,8 +72,7 @@ class DQN:
# randomly sample a batch of transitions from memory
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# convert to tensors
# e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])
### convert to tensors ###
state_batch = torch.tensor(
state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
@@ -96,6 +81,7 @@ class DQN:
reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1])
next_state_batch = torch.tensor(
next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(
done_batch), device=self.device).unsqueeze(1) # convert bool to float, then to a tensor
@@ -112,7 +98,7 @@ class DQN:
# for a terminal state done_batch[0]=1, so the expected_q_value equals the reward
q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
'''
'''Below is the Double DQN way of computing q_target, which differs slightly from Nature DQN'''
next_target_values = self.target_net(
next_state_batch)
# select the action corresponding to the maximum Q(s_t, a) and plug it into next_target_values to get the target net's next_q_value, i.e. Q(s_t | a=argmax Q(s_t, a))
@@ -127,8 +113,8 @@ class DQN:
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # update the model
def save_model(self,path):
torch.save(self.target_net.state_dict(), path)
def save(self,path):
torch.save(self.target_net.state_dict(), path+'DoubleDQN_checkpoint.pth')
def load_model(self,path):
self.target_net.load_state_dict(torch.load(path))
def load(self,path):
self.target_net.load_state_dict(torch.load(path+'DoubleDQN_checkpoint.pth'))
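The hunk above cuts off just after `next_target_values = self.target_net(next_state_batch)`, so the actual Double DQN target computation is not shown. A hedged, self-contained reconstruction of what that step typically looks like (the helper name and the `gather` indexing are assumptions; `done_batch` is taken here as a 1-D float tensor):

```python
# Hedged sketch of a Double DQN target: the online net picks the greedy action,
# the target net evaluates it.
import torch

def double_dqn_target(policy_net, target_net, reward_batch, next_state_batch,
                      done_batch, gamma=0.99):
    with torch.no_grad():  # targets should not propagate gradients
        next_q_values = policy_net(next_state_batch)          # Q(s', .) from the online net
        next_target_values = target_net(next_state_batch)     # Q_target(s', .)
        next_actions = next_q_values.max(1)[1].unsqueeze(1)   # a* = argmax_a Q(s', a)
        next_q_state_value = next_target_values.gather(1, next_actions).squeeze(1)
        return reward_batch + gamma * next_q_state_value * (1 - done_batch)
```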

View File

@@ -5,37 +5,58 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2020-12-22 15:39:46
LastEditTime: 2021-03-17 20:11:19
@Discription:
@Environment: python 3.7.7
'''
import sys,os
sys.path.append(os.getcwd()) # add current terminal path
import gym
import torch
from torch.utils.tensorboard import SummaryWriter
import os
from agent import DQN
from params import SEQUENCE,SAVED_MODEL_PATH,RESULT_PATH
from params import get_args
from utils import save_results
import datetime
from DoubleDQN.agent import DoubleDQN
from common.plot import plot_rewards
from common.utils import save_results
def train(cfg):
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get the current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH):
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH):
os.mkdir(RESULT_PATH)
class DoubleDQNConfig:
def __init__(self):
self.algo = "Double DQN" # algorithm name
self.gamma = 0.99
self.epsilon_start = 0.9 # initial epsilon for the e-greedy policy
self.epsilon_end = 0.01
self.epsilon_decay = 200
self.lr = 0.01 # learning rate
self.memory_capacity = 10000 # capacity of the replay memory
self.batch_size = 128
self.train_eps = 250 # number of training episodes
self.train_steps = 200 # max steps per training episode
self.target_update = 2 # update frequency of the target net
self.eval_eps = 20 # number of eval episodes
self.eval_steps = 200 # max steps per eval episode
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
self.hidden_dim = 128 # hidden layer dimension of the network
def train(cfg,env,agent):
print('Start to train !')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
env = gym.make('CartPole-v0').unwrapped # you can google why unwrapped is used; it is usually not needed in gym here
env.seed(1) # set the env random seed
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
rewards = []
moving_average_rewards = []
rewards,ma_rewards = [],[]
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.train_eps+1):
for i_episode in range(cfg.train_eps):
state = env.reset() # reset the environment state
ep_reward = 0
for i_step in range(1, cfg.train_steps+1):
for i_step in range(cfg.train_steps):
action = agent.choose_action(state) # choose an action based on the current state
next_state, reward, done, _ = env.step(action) # step the environment
ep_reward += reward
@@ -47,80 +68,26 @@ def train(cfg):
# update the target network: copy all weights and biases from the policy net
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'n_steps:', i_step, 'done: ', done,' Explore: %.2f' % agent.epsilon)
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step,done))
ep_steps.append(i_step)
rewards.append(ep_reward)
# compute the moving-average reward
if i_episode == 1:
moving_average_rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
ma_rewards.append(ep_reward)
print('Complete training')
''' save the model '''
if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
os.mkdir(SAVED_MODEL_PATH)
agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
print('model saved')
'''save rewards and related results'''
save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH)
return rewards,ma_rewards
def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
print('start to eval !')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
if __name__ == "__main__":
cfg = DoubleDQNConfig()
env = gym.make('CartPole-v0').unwrapped # you can google why unwrapped is used; it is usually not needed in gym here
env.seed(1) # set the env random seed
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
agent.load_model(saved_model_path+'checkpoint.pth')
rewards = []
moving_average_rewards = []
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.eval_eps+1):
state = env.reset() # reset the environment state
ep_reward = 0
for i_step in range(1, cfg.eval_steps+1):
action = agent.choose_action(state,train=False) # choose an action based on the current state
next_state, reward, done, _ = env.step(action) # step the environment
ep_reward += reward
state = next_state # move to the next state
if done:
break
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'n_steps:', i_step, 'done: ', done)
ep_steps.append(i_step)
rewards.append(ep_reward)
# compute the moving-average reward
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
'''save rewards and related results'''
save_results(rewards,moving_average_rewards,ep_steps,tag='eval',result_path=RESULT_PATH)
print('Complete evaling')
if __name__ == "__main__":
cfg = get_args()
if cfg.train:
train(cfg)
eval(cfg)
else:
model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
eval(cfg,saved_model_path=model_path)
agent = DoubleDQN(n_states,n_actions,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)

View File

@@ -5,12 +5,11 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
LastEditTime: 2020-12-22 12:56:27
LastEditTime: 2021-01-20 18:58:37
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:

View File

@@ -12,13 +12,13 @@ LastEditTime: 2020-08-19 16:55:54
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
class MLP(nn.Module):
def __init__(self, n_states=4, n_actions=18):
""" Initialize the Q network as a fully-connected network
n_states: number of input features, i.e. the dimension of the environment state
n_actions: total number of output actions
"""
super(FCN, self).__init__()
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, 128) # input layer
self.fc2 = nn.Linear(128, 128) # hidden layer
self.fc3 = nn.Linear(128, n_actions) # output layer

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-12-22 15:22:17
LastEditor: John
LastEditTime: 2020-12-22 15:26:09
LastEditTime: 2021-01-21 14:30:38
Discription:
Environment:
'''
@@ -16,7 +16,10 @@ import argparse
ALGO_NAME = 'Double DQN'
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'
TRAIN_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
EVAL_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
def get_args():
'''model parameters

View File

@@ -24,14 +24,14 @@ def plot(item,ylabel='rewards_train', save_fig = True):
plt.ylabel(ylabel)
plt.xlabel('episodes')
if save_fig:
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.savefig(os.path.dirname(__file__)+"/results/"+ylabel+".png")
plt.show()
# plt.show()
if __name__ == "__main__":
output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
output_path = os.path.split(os.path.abspath(__file__))[0]+"/results/"
tag = 'train'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)

Binary files not shown: six images removed (28 KiB, 39 KiB, 23 KiB, 57 KiB, 22 KiB, 56 KiB) and one image added (74 KiB).

View File

@@ -13,7 +13,7 @@ import os
import numpy as np
def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./result'):
def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./results'):
if not os.path.exists(result_path): # 检测是否存在文件夹
os.mkdir(result_path)
np.save(result_path+'rewards_'+tag+'.npy', rewards)

codes/LICENSE (new file, 21 lines)
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 John Jim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -2,10 +2,10 @@
## Environment notes
See "The Racetrack" in the [environment notes](https://github.com/datawhalechina/leedeeprl-notes/blob/master/codes/env_info.md)
See "The Racetrack" in the [environment notes](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)
## First-Visit MC introduction
Pseudocode
### Pseudocode
![mc_control_algo](assets/mc_control_algo.png)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-12 16:14:34
LastEditor: John
LastEditTime: 2021-03-12 16:15:12
LastEditTime: 2021-03-17 12:35:06
Discription:
Environment:
'''
@@ -26,11 +26,13 @@ class FisrtVisitMC:
def choose_action(self,state):
''' e-greedy policy '''
best_action = np.argmax(self.Q[state])
# action = best_action
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
if state in self.Q.keys():
best_action = np.argmax(self.Q[state])
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else:
action = np.random.randint(0,self.n_actions)
return action
def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition
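The update method is truncated right after the comment above. For context, a first-visit Monte Carlo control update over one episode usually looks like the sketch below (the `Q`, `returns_sum` and `returns_count` containers and the `(state, action, reward)` transition layout are assumptions based on the surrounding code, not this commit's exact implementation):

```python
# Hedged sketch of a first-visit MC control update; not this commit's exact code.
def first_visit_mc_update(one_ep_transition, Q, returns_sum, returns_count, gamma=0.9):
    # every (state, action) pair that appears in this episode
    sa_pairs = set((tuple(s), a) for s, a, _ in one_ep_transition)
    for state, action in sa_pairs:
        # index of the first visit to this (state, action) pair
        first_idx = next(i for i, (s, a, _) in enumerate(one_ep_transition)
                         if (tuple(s), a) == (state, action))
        # discounted return following the first visit
        G = sum(r * gamma ** k for k, (_, _, r) in enumerate(one_ep_transition[first_idx:]))
        returns_sum[(state, action)] += G
        returns_count[(state, action)] += 1.0
        Q[state][action] = returns_sum[(state, action)] / returns_count[(state, action)]
```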

Binary files not shown: two images removed (104 KiB, 29 KiB).

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-11 14:26:44
LastEditor: John
LastEditTime: 2021-03-12 16:15:46
LastEditTime: 2021-03-17 12:35:36
Discription:
Environment:
'''
@@ -35,7 +35,7 @@ class MCConfig:
def __init__(self):
self.epsilon = 0.15 # epsilon: probability of selecting a random action
self.gamma = 0.9 # gamma: Gamma discount factor.
self.n_episodes = 300
self.n_episodes = 150
self.n_steps = 2000
def get_mc_args():
@@ -58,8 +58,8 @@ def mc_train(cfg,env,agent):
one_ep_transition = []
state = env.reset()
ep_reward = 0
# while True:
for t in range(cfg.n_steps):
while True:
# for t in range(cfg.n_steps):
action = agent.choose_action(state)
next_state, reward, done = env.step(action)
ep_reward+=reward

Binary files not shown: one image removed (40 KiB) and one image added (45 KiB).

View File

@@ -1,38 +1,15 @@
# Policy Gradient
Implements REINFORCE, the most basic Policy Gradient method
## Usage
Just run ```main.py```
## Explanation
See my blog post [Policy Gradient in Practice](https://blog.csdn.net/JohnJim0/article/details/110236851)
## Environment
python 3.7.9
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0
python 3.7.9, pytorch 1.6.0
## How to run
train:
```bash
python main.py
```
eval:
```bash
python main.py --train 0
```
tensorboard:
```bash
tensorboard --logdir logs
```
## References
[REINFORCE and the Reparameterization Trick](https://blog.csdn.net/JohnJim0/article/details/110230703)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44
LastEditor: John
LastEditTime: 2020-11-23 17:04:37
LastEditTime: 2021-03-13 11:50:16
Discription:
Environment:
'''
@@ -14,24 +14,23 @@ from torch.distributions import Bernoulli
from torch.autograd import Variable
import numpy as np
from model import FCN
from common.model import MLP1
class PolicyGradient:
def __init__(self, state_dim,device='cpu',gamma = 0.99,lr = 0.01,batch_size=5):
self.gamma = gamma
self.policy_net = FCN(state_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=lr)
self.batch_size = batch_size
def __init__(self, n_states,cfg):
self.gamma = cfg.gamma
self.policy_net = MLP1(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size
def choose_action(self,state):
state = torch.from_numpy(state).float()
state = Variable(state)
probs = self.policy_net(state)
m = Bernoulli(probs)
m = Bernoulli(probs) # Bernoulli distribution
action = m.sample()
action = action.data.numpy().astype(int)[0] # convert to a scalar
return action
@@ -67,6 +66,6 @@ class PolicyGradient:
loss.backward()
self.optimizer.step()
def save_model(self,path):
torch.save(self.policy_net.state_dict(), path)
torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pth')
def load_model(self,path):
self.policy_net.load_state_dict(torch.load(path))
self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pth'))
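The fold `@@ -67,6 +66,6 @@` hides the actual update step of this agent. As a reference, a REINFORCE update over the pooled transitions typically discounts and normalizes the rewards, then does gradient ascent on the return-weighted log-probabilities. The standalone sketch below follows the Bernoulli policy used in choose_action but treats the pooled batch as a single trajectory for simplicity (an assumption, not this commit's exact code):

```python
# Hedged sketch of a REINFORCE update for the Bernoulli policy above.
import numpy as np
import torch
from torch.distributions import Bernoulli

def reinforce_update(policy_net, optimizer, state_pool, action_pool, reward_pool, gamma=0.99):
    # discount rewards backwards through the pooled batch
    running_add, discounted = 0.0, [0.0] * len(reward_pool)
    for i in reversed(range(len(reward_pool))):
        running_add = running_add * gamma + reward_pool[i]
        discounted[i] = running_add
    # normalize to reduce gradient variance
    discounted = (np.array(discounted) - np.mean(discounted)) / (np.std(discounted) + 1e-8)
    optimizer.zero_grad()
    for state, action, ret in zip(state_pool, action_pool, discounted):
        probs = policy_net(torch.from_numpy(state).float())
        m = Bernoulli(probs)
        loss = -m.log_prob(torch.FloatTensor([action])) * ret  # gradient ascent on return-weighted log-prob
        loss.backward()
    optimizer.step()
```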

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:23:10
LastEditor: John
LastEditTime: 2020-11-23 11:55:24
Discription:
Environment:
'''
import gym
def env_init():
env = gym.make('CartPole-v0') # you can google why unwrapped is used; it is usually not needed in gym here
env.seed(1) # set the env random seed
state_dim = env.observation_space.shape[0]
n_actions = env.action_space.n
return env,state_dim,n_actions

View File

@@ -5,34 +5,47 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2020-11-24 19:52:40
LastEditTime: 2021-03-13 11:50:32
Discription:
Environment:
'''
import sys,os
sys.path.append(os.getcwd()) # add the current working directory to the path
from itertools import count
import torch
import os
from torch.utils.tensorboard import SummaryWriter
import datetime
import gym
from PolicyGradient.agent import PolicyGradient
from common.plot import plot_rewards
from common.utils import save_results
from env import env_init
from params import get_args
from agent import PolicyGradient
from params import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
from utils import save_results,save_model
from plot import plot
def train(cfg):
env,state_dim,n_actions = env_init()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu
agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr)
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get the current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # check whether the folder exists
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH): # check whether the folder exists
os.mkdir(RESULT_PATH)
class PGConfig:
def __init__(self):
self.train_eps = 300 # number of training episodes
self.batch_size = 8
self.lr = 0.01 # learning rate
self.gamma = 0.99
self.hidden_dim = 36 # hidden layer dimension
def train(cfg,env,agent):
'''the *_pool lists below store transition sequences used for the gradient update'''
state_pool = [] # stores the state sequences of every batch_size episodes
action_pool = []
reward_pool = []
''' store each episode's reward for plotting'''
rewards = []
moving_average_rewards = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
writer = SummaryWriter(log_dir) # tensorboard writer
ma_rewards = []
for i_episode in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
@@ -55,55 +68,22 @@ def train(cfg):
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if i_episode == 0:
moving_average_rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode+1)
writer.close()
print('Complete training')
save_model(agent,model_path=SAVED_MODEL_PATH)
'''save rewards and related results'''
save_results(rewards,moving_average_rewards,tag='train',result_path=RESULT_PATH)
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards_train')
def eval(cfg,saved_model_path = SAVED_MODEL_PATH):
env,state_dim,n_actions = env_init()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr)
agent.load_model(saved_model_path+'checkpoint.pth')
rewards = []
moving_average_rewards = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
writer = SummaryWriter(log_dir) # tensorboard writer
for i_episode in range(cfg.eval_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # choose an action based on the current state
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
if done:
print('Episode:', i_episode, ' Reward:', ep_reward)
break
rewards.append(ep_reward)
if i_episode == 0:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode+1)
writer.close()
print('Complete evaling')
ma_rewards.append(ep_reward)
print('complete training')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = get_args()
if cfg.train:
train(cfg)
eval(cfg)
else:
model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
eval(cfg,saved_model_path=model_path)
cfg = PGConfig()
env = gym.make('CartPole-v0') # you can google why unwrapped is used; it is usually not needed in gym here
env.seed(1) # set the env random seed
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = PolicyGradient(n_states,cfg)
rewards, ma_rewards = train(cfg,env,agent)
agent.save_model(SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = "Policy Gradient",path=RESULT_PATH)
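The ma_rewards bookkeeping that appears in each training script is an exponential moving average of the episode rewards, ma_t = 0.9 * ma_{t-1} + 0.1 * r_t. A standalone restatement of that pattern:

```python
# Exponential moving average of episode rewards, as used for ma_rewards.
def moving_average(rewards, beta=0.9):
    ma = []
    for r in rewards:
        ma.append(beta * ma[-1] + (1 - beta) * r if ma else r)
    return ma

print(moving_average([0, 10, 10, 10]))  # [0, 1.0, 1.9, 2.71]
```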

View File

@@ -1,27 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:18:46
LastEditor: John
LastEditTime: 2020-11-27 16:55:25
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
''' fully-connected network '''
def __init__(self,state_dim):
super(FCN, self).__init__()
# 24 and 36 are the hidden layer sizes; adjust them according to state_dim and n_actions
self.fc1 = nn.Linear(state_dim, 36)
self.fc2 = nn.Linear(36, 36)
self.fc3 = nn.Linear(36, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x

View File

@@ -1,29 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:25:37
LastEditor: John
LastEditTime: 2020-11-26 19:11:21
Discription: stores parameters
Environment:
'''
import argparse
import datetime
import os
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
def get_args():
'''training parameters'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 means train, 0 means eval only
parser.add_argument("--train_eps", default=300, type=int) # max number of training episodes
parser.add_argument("--eval_eps", default=100, type=int) # max number of eval episodes
parser.add_argument("--batch_size", default=4, type=int) # number of episodes per gradient update
parser.add_argument("--policy_lr", default=0.01, type=float) # learning rate
config = parser.parse_args()
return config

View File

@@ -1,46 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-23 13:48:46
LastEditor: John
LastEditTime: 2020-11-23 13:48:48
Discription:
Environment:
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
def plot(item,ylabel='rewards_train', save_fig = True):
'''plot using seaborn
'''
sns.set()
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of DQN')
plt.ylabel(ylabel)
plt.xlabel('episodes')
if save_fig:
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.show()
if __name__ == "__main__":
output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
tag = 'train'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)
tag = 'eval'
rewards=np.load(output_path+"rewards_"+tag+".npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
steps=np.load(output_path+"steps_"+tag+".npy")
plot(rewards,ylabel='rewards_'+tag)
plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
plot(steps,ylabel='steps_'+tag)

Some files were not shown because too many files have changed in this diff.