Merge branch 'master' of github.com:datawhalechina/easy-rl

Committed by qiwang067 on 2022-06-01 23:08:03 +08:00
149 changed files with 1867 additions and 1542 deletions


@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
 class A2C:
     ''' A2C算法
     '''
-    def __init__(self,state_dim,action_dim,cfg) -> None:
+    def __init__(self,n_states,n_actions,cfg) -> None:
         self.gamma = cfg.gamma
         self.device = cfg.device
-        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
+        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
         self.optimizer = optim.Adam(self.model.parameters())
     def compute_returns(self,next_value, rewards, masks):
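The hunk stops at the `compute_returns` signature. For readers skimming the diff, a minimal sketch of the bootstrapped-return computation such a method typically performs (assuming `rewards` and `masks` are per-step sequences and `next_value` is the critic's estimate for the state after the last step) is:

```python
def compute_returns(next_value, rewards, masks, gamma=0.99):
    # Walk the rollout backwards, bootstrapping from the critic's value of the
    # final state; masks[step] == 0 cuts the bootstrap at episode boundaries.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns
```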


@@ -10,7 +10,7 @@ import torch
 import torch.optim as optim
 import datetime
 from common.multiprocessing_env import SubprocVecEnv
-from A2C.agent import ActorCritic
+from a2c import ActorCritic
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards
@@ -74,9 +74,9 @@ def train(cfg,envs):
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
     env = gym.make(cfg.env_name)  # a single env
     env.seed(10)
-    state_dim = envs.observation_space.shape[0]
-    action_dim = envs.action_space.n
-    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+    n_states = envs.observation_space.shape[0]
+    n_actions = envs.action_space.n
+    model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
     optimizer = optim.Adam(model.parameters())
     frame_idx = 0
     test_rewards = []


@@ -39,11 +39,11 @@ class ReplayBuffer:
         '''
         return len(self.buffer)
 class Actor(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
         super(Actor, self).__init__()
-        self.linear1 = nn.Linear(state_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
-        self.linear3 = nn.Linear(hidden_dim, action_dim)
+        self.linear3 = nn.Linear(hidden_dim, n_actions)
         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -54,10 +54,10 @@ class Actor(nn.Module):
         x = torch.tanh(self.linear3(x))
         return x
 class Critic(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
         super(Critic, self).__init__()
-        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
         self.linear3 = nn.Linear(hidden_dim, 1)
         # 随机初始化为较小的值
@@ -72,12 +72,12 @@ class Critic(nn.Module):
         x = self.linear3(x)
         return x
 class DDPG:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):
         self.device = cfg.device
-        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
         # 复制参数到目标网络
         for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
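The `__init__` shown above only hard-copies the online weights into the target networks; the `cfg.soft_tau` hyperparameter that appears later in this commit drives the per-update soft (Polyak) update. A minimal sketch of that step, assuming the usual DDPG formulation rather than quoting this repository's exact code, is:

```python
def soft_update(target_net, net, soft_tau=1e-2):
    # target <- soft_tau * online + (1 - soft_tau) * target, parameter-wise
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        target_param.data.copy_(soft_tau * param.data + (1.0 - soft_tau) * target_param.data)
```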


@@ -39,15 +39,15 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.action_dim = action_space.shape[0]
+        self.n_actions = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
         self.obs = x + dx
         return self.obs
     def get_action(self, action, t=0):
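For reference, `evolve_obs` is a discretised Ornstein-Uhlenbeck step with unit time step, which is what the code above implements:

$$x_{t+1} = x_t + \theta(\mu - x_t) + \sigma\,\varepsilon_t,\qquad \varepsilon_t \sim \mathcal{N}(0, I)$$

`get_action` then adds this temporally correlated noise to the actor's output and, in the usual implementation, decays $\sigma$ from `max_sigma` to `min_sigma` over `decay_period` steps.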


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2021-09-16 01:31:33
+LastEditTime: 2022-02-10 06:23:27
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -18,23 +18,29 @@ import datetime
 import gym
 import torch
-from DDPG.env import NormalizedActions
-from DDPG.agent import DDPG
+from env import NormalizedActions,OUNoise
+from ddpg import DDPG
 from DDPG.train import train,test
 from common.utils import save_results,make_dir
 from common.utils import plot_rewards
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-algo_name = 'DDPG'  # 算法名称
-env_name = 'Pendulum-v1'  # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
-class DDPGConfig:
+class Config:
+    '''超参数
+    '''
     def __init__(self):
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DDPG'  # 算法名称
+        self.env_name = 'Pendulum-v1'  # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.seed = 10  # 随机种子置0则不设置随机种子
         self.train_eps = 300  # 训练的回合数
         self.test_eps = 50  # 测试的回合数
+        ################################################################################
+        ################################## 算法超参数 ###################################
         self.gamma = 0.99  # 折扣因子
         self.critic_lr = 1e-3  # 评论家网络的学习率
         self.actor_lr = 1e-4  # 演员网络的学习率
@@ -43,39 +49,92 @@ class DDPGConfig:
         self.target_update = 2  # 目标网络的更新频率
         self.hidden_dim = 256  # 网络隐藏层维度
         self.soft_tau = 1e-2  # 软更新参数
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.result_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/results/'  # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/models/'  # 保存模型的路径
+        ################################################################################
+        ################################# 保存结果相关参数 ################################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
         self.save = True  # 是否保存图片
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################################################################
 def env_agent_config(cfg,seed=1):
     env = NormalizedActions(gym.make(cfg.env_name))  # 装饰action噪声
     env.seed(seed)  # 随机种子
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
-    agent = DDPG(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = DDPG(n_states,n_actions,cfg)
     return env,agent
+def train(cfg, env, agent):
+    print('开始训练!')
+    print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
+    ou_noise = OUNoise(env.action_space)  # 动作噪声
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        ou_noise.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            action = ou_noise.get_action(action, i_step)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            agent.update()
+            state = next_state
+        if (i_ep+1)%10 == 0:
+            print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练!')
+    return rewards, ma_rewards
-cfg = DDPGConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path)
-agent.save(path=plot_cfg.model_path)
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)
-rewards,ma_rewards = test(plot_cfg,env,agent)
-save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+def test(cfg, env, agent):
+    print('开始测试!')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = next_state
+        print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
+    print('完成测试!')
+    return rewards, ma_rewards
+if __name__ == "__main__":
+    cfg = Config()
+    # 训练
+    env,agent = env_agent_config(cfg,seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    # 测试
+    env,agent = env_agent_config(cfg,seed=10)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = test(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果


@@ -1,64 +0,0 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
from DDPG.env import OUNoise
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards, ma_rewards


@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 class FCN(nn.Module):
-    def __init__(self, state_dim=4, action_dim=18):
+    def __init__(self, n_states=4, n_actions=18):
         """ 初始化q网络为全连接网络
-            state_dim: 输入的feature即环境的state数目
-            action_dim: 输出的action总个数
+            n_states: 输入的feature即环境的state数目
+            n_actions: 输出的action总个数
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(state_dim, 128)  # 输入层
+        self.fc1 = nn.Linear(n_states, 128)  # 输入层
         self.fc2 = nn.Linear(128, 128)  # 隐藏层
-        self.fc3 = nn.Linear(128, action_dim)  # 输出层
+        self.fc3 = nn.Linear(128, n_actions)  # 输出层
     def forward(self, x):
         # 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
         x = F.relu(self.fc2(x))
         return self.fc3(x)
 ```
-输入为state_dim输出为action_dim包含一个128维度的隐藏层这里根据需要可增加隐藏层维度和数量然后一般使用relu激活函数这里跟深度学习的网路设置是一样的。
+输入为n_states,输出为n_actions包含一个128维度的隐藏层这里根据需要可增加隐藏层维度和数量然后一般使用relu激活函数这里跟深度学习的网路设置是一样的。
 ### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
 在类中建立两个网络以及optimizer和memory
 ```python
-self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # copy params from policy net
     target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
     if random.random() > self.epsilon(self.frame_idx):
         action = self.predict(state)
     else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
     return action
 ```
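The `self.epsilon(self.frame_idx)` call used above corresponds to the exponentially decaying schedule defined elsewhere in this commit:

$$\epsilon(t) = \epsilon_{\text{end}} + (\epsilon_{\text{start}} - \epsilon_{\text{end}})\, e^{-t/\text{epsilon\_decay}}$$

so exploration starts near `epsilon_start` and decays towards `epsilon_end` as the frame index $t$ grows.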


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-12-22 14:01:37
+LastEditTime: 2022-03-02 11:05:11
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -20,22 +20,7 @@ import random
 import math
 import numpy as np
-class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
-        """ 初始化q网络为全连接网络
-            state_dim: 输入的特征数即环境的状态维度
-            action_dim: 输出的动作维度
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim)  # 输入层
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim)  # 输出层
-    def forward(self, x):
-        # 各层对应的激活函数
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
 class ReplayBuffer:
     def __init__(self, capacity):
@@ -62,9 +47,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
-        self.action_dim = action_dim  # 总的动作个数
+    def __init__(self, n_actions,model,cfg):
+        self.n_actions = n_actions  # 总的动作个数
         self.device = cfg.device  # 设备cpu或gpu等
         self.gamma = cfg.gamma  # 奖励的折扣因子
         # e-greedy策略相关参数
@@ -73,8 +58,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = model.to(self.device)
+        self.target_net = model.to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # 复制参数到目标网路targe_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # 优化器
@@ -86,23 +71,24 @@ class DQN:
         self.frame_idx += 1
         if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
-                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                 q_values = self.policy_net(state)
                 action = q_values.max(1)[1].item()  # 选择Q值最大的动作
         else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时不更新策略
             return
         # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        # print('updating')
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
             self.batch_size)
-        # 转为张量
-        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
         action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
         reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
-        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
         done_batch = torch.tensor(np.float32(done_batch), device=self.device)
         q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # 计算当前状态(s_t,a)对应的Q(s_t, a)
         next_q_values = self.target_net(next_state_batch).max(1)[0].detach()  # 计算下一时刻的状态(s_t_,a)对应的Q值
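For reference, the two lines at the end of this hunk feed the standard DQN target that the rest of `update` (shown in full in `dqn_cnn2.py` below) computes:

$$y = r + \gamma\,(1 - \text{done})\,\max_{a'} Q_{\text{target}}(s', a'),\qquad \mathcal{L} = \operatorname{MSE}\big(Q_{\text{policy}}(s, a),\, y\big)$$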


@@ -70,9 +70,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
-        self.action_dim = action_dim  # 总的动作个数
+    def __init__(self, n_states, n_actions, cfg):
+        self.n_actions = n_actions  # 总的动作个数
         self.device = cfg.device  # 设备cpu或gpu等
         self.gamma = cfg.gamma  # 奖励的折扣因子
         # e-greedy策略相关参数
@@ -81,8 +81,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = CNN(state_dim, action_dim).to(self.device)
-        self.target_net = CNN(state_dim, action_dim).to(self.device)
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # 复制参数到目标网路targe_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # 优化器
@@ -94,11 +94,12 @@ class DQN:
         self.frame_idx += 1
         if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
+                print(type(state))
                 state = torch.tensor([state], device=self.device, dtype=torch.float32)
                 q_values = self.policy_net(state)
                 action = q_values.max(1)[1].item()  # 选择Q值最大的动作
         else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时不更新策略

codes/DQN/dqn_cnn2.py (new file, 142 lines)

@@ -0,0 +1,142 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
import math
import numpy as np
class CNN(nn.Module):
def __init__(self, n_frames, n_actions):
super(CNN,self).__init__()
self.n_frames = n_frames
self.n_actions = n_actions
# Layers
self.conv1 = nn.Conv2d(
in_channels=n_frames,
out_channels=16,
kernel_size=8,
stride=4,
padding=2
)
self.conv2 = nn.Conv2d(
in_channels=16,
out_channels=32,
kernel_size=4,
stride=2,
padding=1
)
self.fc1 = nn.Linear(
in_features=3200,
out_features=256,
)
self.fc2 = nn.Linear(
in_features=256,
out_features=n_actions,
)
# Activation Functions
self.relu = nn.ReLU()
def flatten(self, x):
batch_size = x.size()[0]
x = x.view(batch_size, -1)
return x
def forward(self, x):
# Forward pass
x = self.relu(self.conv1(x)) # In: (80, 80, 4) Out: (20, 20, 16)
x = self.relu(self.conv2(x)) # In: (20, 20, 16) Out: (10, 10, 32)
x = self.flatten(x) # In: (10, 10, 32) Out: (3200,)
x = self.relu(self.fc1(x)) # In: (3200,) Out: (256,)
x = self.fc2(x) # In: (256,) Out: (4,)
return x
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)
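The `in_features=3200` of `fc1` follows from the usual conv output-size formula $\lfloor (n + 2p - k)/s \rfloor + 1$: an $80\times80$ input with `kernel_size=8, stride=4, padding=2` gives $20\times20$, the second conv with `kernel_size=4, stride=2, padding=1` gives $10\times10$, and flattening the $32$ channels yields $32 \times 10 \times 10 = 3200$, matching the shape comments in `forward`.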

Binary file added (image, 28 KiB; not shown)

Binary file added (image, 41 KiB; not shown)


@@ -1,5 +1,7 @@
 import sys
 import os
+import torch.nn as nn
+import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
 import torch
 import datetime
 import numpy as np
-from common.utils import save_results, make_dir
+from common.utils import save_results_1, make_dir
 from common.utils import plot_rewards
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+class MLP(nn.Module):
+    def __init__(self, n_states,n_actions,hidden_dim=128):
+        """ 初始化q网络为全连接网络
+            n_states: 输入的特征数即环境的状态维度
+            n_actions: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # 输出层
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
 class Config:
     '''超参数
     '''
     def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DQN'  # 算法名称
-        self.env_name = 'CartPole-v0'  # 环境名称
+        ############################### hyperparameters ################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
         self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
         self.seed = 10  # 随机种子置0则不设置随机种子
         self.train_eps = 200  # 训练的回合数
-        self.test_eps = 30  # 测试的回合数
+        self.test_eps = 20  # 测试的回合数
         ################################################################################
         ################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
         self.target_update = 4  # 目标网络的更新频率
         self.hidden_dim = 256  # 网络隐藏层
         ################################################################################
-        ################################# 保存结果相关参数 ##############
+        ################################# 保存结果相关参数 ################################
         self.result_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/results/'  # 保存结果的路径
         self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
     ''' 创建环境和智能体
     '''
     env = gym.make(cfg.env_name)  # 创建环境
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)
+    agent = DQN(n_actions, model, cfg)  # 创建智能体
     if cfg.seed !=0:  # 设置随机种子
         torch.manual_seed(cfg.seed)
         env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
     rewards = []  # 记录所有回合的奖励
     ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
     for i_ep in range(cfg.train_eps):
         ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
         state = env.reset()  # 重置环境,返回初始状态
         while True:
+            ep_step += 1
             action = agent.choose_action(state)  # 选择动作
             next_state, reward, done, _ = env.step(action)  # 更新环境返回transition
             agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
                 break
         if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
             agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
         else:
             ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 10 == 0:
-            print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
-    print('完成训练!')
+        if (i_ep + 1) % 1 == 0:
+            print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('Finish training!')
     env.close()
-    return rewards, ma_rewards
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic
 def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
     ################################################################################
     rewards = []  # 记录所有回合的奖励
     ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
     for i_ep in range(cfg.test_eps):
         ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
         state = env.reset()  # 重置环境,返回初始状态
         while True:
+            ep_step+=1
             action = agent.choose_action(state)  # 选择动作
             next_state, reward, done, _ = env.step(action)  # 更新环境返回transition
             state = next_state  # 更新下一个状态
             ep_reward += reward  # 累加奖励
             if done:
                 break
+        steps.append(ep_step)
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
         else:
             ma_rewards.append(ep_reward)
-        print(f"回合{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
+        print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
     print('完成测试!')
     env.close()
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
 if __name__ == "__main__":
     cfg = Config()
     # 训练
     env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
     agent.save(path=cfg.model_path)  # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
-                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    save_results_1(res_dic, tag='train',
+                   path=cfg.result_path)  # 保存结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # 画出结果
     # 测试
     env, agent = env_agent_config(cfg)
     agent.load(path=cfg.model_path)  # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
-                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
+    res_dic = test(cfg, env, agent)
+    save_results_1(res_dic, tag='test',
+                   path=cfg.result_path)  # 保存结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # 画出结果


@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 11:40:44
+LastEditTime: 2022-02-10 06:17:41
 Discription: 使用 Nature DQN 训练 CartPole-v1
 '''
 import sys
@@ -19,7 +19,7 @@ import torch
 import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = "DQN"  # 算法名称
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
     '''
     env = gym.make(cfg.env_name)  # 创建环境
     env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
     return env, agent
 def train(cfg, env, agent):


@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 15:27:48
+LastEditTime: 2022-02-10 06:17:46
 Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
 '''
 import sys
@@ -20,7 +20,7 @@ import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
 from common.atari_wrappers import make_atari, wrap_deepmind
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = 'DQN-cnn'  # 算法名称
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
     # env = wrap_deepmind(env)
     # env = wrap_pytorch(env)
     env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
     return env, agent
 def train(cfg, env, agent):

codes/DQN/task4.py (new file, 180 lines)

@@ -0,0 +1,180 @@
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results_1, make_dir
from common.utils import plot_rewards
from dqn_1 import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=256):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc4 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return self.fc4(x)
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
# self.env_name = 'Breakout-ram-v0' # 环境名称
self.env_name = 'ALE/Pong-ram-v5'
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 5 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500000 # e-greedy策略中epsilon的衰减率
self.lr = 0.00025 # 学习率
self.memory_capacity = int(5e4) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
model = MLP(n_states,n_actions)
agent = DQN(n_states, n_actions, model, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
ep_step = 0
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results_1(res_dic, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results_1(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果

codes/DQN/task5.py (new file, 149 lines)

@@ -0,0 +1,149 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
self.env_name = 'SpaceInvaders-ram-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 20000 # e-greedy策略中epsilon的衰减率
self.lr = 2e-4 # 学习率
self.memory_capacity = int(1e5) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
agent = DQN(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

codes/DQN/test copy.py (new file, 184 lines)

@@ -0,0 +1,184 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
import matplotlib.pyplot as plt
class DQN:
def __init__(self, env):
self.env = env
self.memory = deque(maxlen=400000)
self.gamma = 0.99
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = self.epsilon_min / 500000
self.batch_size = 32
self.train_start = 1000
self.state_size = self.env.observation_space.shape[0]*4
self.action_size = self.env.action_space.n
self.learning_rate = 0.00025
self.evaluation_model = self.create_model()
self.target_model = self.create_model()
def create_model(self):
model = Sequential()
model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(self.env.action_space.n, activation='linear'))
model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))
return model
def choose_action(self, state, steps):
if steps > 50000:
if self.epsilon > self.epsilon_min:
self.epsilon -= self.epsilon_decay
if np.random.random() < self.epsilon:
return self.env.action_space.sample()
return np.argmax(self.evaluation_model.predict(state)[0])
def remember(self, cur_state, action, reward, new_state, done):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = (cur_state, action, reward, new_state, done)
self.memory.extend([transition])
self.memory_counter += 1
def replay(self):
if len(self.memory) < self.train_start:
return
mini_batch = random.sample(self.memory, self.batch_size)
update_input = np.zeros((self.batch_size, self.state_size))
update_target = np.zeros((self.batch_size, self.action_size))
for i in range(self.batch_size):
state, action, reward, new_state, done = mini_batch[i]
target = self.evaluation_model.predict(state)[0]
if done:
target[action] = reward
else:
target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])
update_input[i] = state
update_target[i] = target
self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
def target_train(self):
self.target_model.set_weights(self.evaluation_model.get_weights())
return
def visualize(self, reward, episode):
plt.plot(episode, reward, 'ob-')
plt.title('Average reward each 100 episode')
plt.ylabel('Reward')
plt.xlabel('Episodes')
plt.grid()
plt.show()
def transform(self,state):
if state.shape[1]==512:
return state
a=[np.binary_repr(x,width=8) for x in state[0]]
res=[]
for x in a:
res.extend([x[:2],x[2:4],x[4:6],x[6:]])
res=[int(x,2) for x in res]
return np.array(res)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def main():
# env = gym.make('Breakout-ram-v0')
env = gym.make('Breakout-ram-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space.shape[0])
print(env.observation_space.high)
print(env.observation_space.low)
#print(env.observation_space.shape)
episodes = 5000
trial_len = 10000
tmp_reward=0
sum_rewards = 0
n_success = 0
total_steps = 0
graph_reward = []
graph_episodes = []
time_record = []
dqn_agent = DQN(env=env)
for i_episode in range(episodes):
start_time = time.time()
total_reward = 0
cur_state = env.reset().reshape(1,128)
cur_state=dqn_agent.transform(cur_state).reshape(1,128*4)/4
i_step=0
for step in range(trial_len):
#env.render()
i_step+=1
action = dqn_agent.choose_action(cur_state, total_steps)
new_state, reward, done, _ = env.step(action)
new_state = new_state.reshape(1, 128)
new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
total_reward += reward
sum_rewards += reward
tmp_reward += reward
if reward>0: #Testing whether it is good.
reward=1
dqn_agent.remember(cur_state, action, reward, new_state, done)
if total_steps > 10000:
if total_steps%4 == 0:
dqn_agent.replay()
if total_steps%5000 == 0:
dqn_agent.target_train()
cur_state = new_state
total_steps += 1
if done:
env.reset()
break
if (i_episode+1) % 100 == 0:
graph_reward.append(sum_rewards/100)
graph_episodes.append(i_episode+1)
sum_rewards = 0
print("Episode ",i_episode+1," Reward: ")
print(graph_reward[-1])
end_time = time.time()
time_record.append(end_time-start_time)
print("NOW in episode: " + str(i_episode))
print("Time cost: " + str(end_time-start_time))
print("Reward: ",tmp_reward)
print("Step:", i_step)
tmp_reward=0
print("Reward: ")
print(graph_reward)
print("Episode: ")
print(graph_episodes)
print("Average_time: ")
print(sum(time_record)/5000)
dqn_agent.visualize(graph_reward, graph_episodes)
if __name__ == '__main__':
main()


@@ -90,15 +90,15 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.action_dim = action_space.shape[0]
+        self.n_actions = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
         self.obs = x + dx
         return self.obs
     def get_action(self, action, t=0):


@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境如下图它通过向左(动作=0
 import gym
 env = gym.make('CartPole-v0')  # 建立环境
 env.seed(1)  # 随机种子
-state_dim = env.observation_space.shape[0]  # 状态维度
-action_dim = env.action_space.n  # 动作维度
+n_states = env.observation_space.shape[0]  # 状态维度
+n_actions = env.action_space.n  # 动作维度
 state = env.reset()  # 初始化环境
-print(f"状态维度:{state_dim},动作维度:{action_dim}")
+print(f"状态维度:{n_states},动作维度:{n_actions}")
 print(f"初始状态:{state}")
 ```
@@ -157,7 +157,7 @@ def choose_action(self, state):
         q_values = self.policy_net(state)
         action = q_values.max(1)[1].item()  # 选择Q值最大的动作
     else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
 ```
 可以看到跟Q学习算法其实是一样的都是用的$\epsilon-greedy$策略只是使用神经网络的话我们需要通过Torch或者Tensorflow工具来处理相应的数据。


@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
这里我们在程序中使用了一个装饰器重新定义环境但不影响对环境的理解感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可然后我们可以查看环境的状态和动作维度目 这里我们在程序中使用了一个装饰器重新定义环境但不影响对环境的理解感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可然后我们可以查看环境的状态和动作维度目
```python ```python
state_dim = env.observation_space.n # 状态维度 n_states = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度 n_actions = env.action_space.n # 动作维度
print(f"状态维度:{state_dim},动作维度:{action_dim}") print(f"状态维度:{n_states},动作维度:{n_actions}")
``` ```
打印出来的结果如下: 打印出来的结果如下:
@@ -72,9 +72,9 @@ print(state)
env = gym.make('CliffWalking-v0') # 定义环境 env = gym.make('CliffWalking-v0') # 定义环境
env = CliffWalkingWapper(env) # 装饰环境 env = CliffWalkingWapper(env) # 装饰环境
env.seed(1) # 设置随机种子 env.seed(1) # 设置随机种子
state_dim = env.observation_space.n # 状态维度 n_states = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度 n_actions = env.action_space.n # 动作维度
agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数 agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
ep_reward = 0 # 记录每个回合的奖励 ep_reward = 0 # 记录每个回合的奖励
state = env.reset() # 重置环境 state = env.reset() # 重置环境
@@ -126,7 +126,7 @@ def choose_action(self, state):
if np.random.uniform(0, 1) > self.epsilon: if np.random.uniform(0, 1) > self.epsilon:
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作 action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
else: else:
action = np.random.choice(self.action_dim) # 随机选择动作 action = np.random.choice(self.n_actions) # 随机选择动作
return action return action
``` ```
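The hunks above only show action selection; the learning step of tabular Q-learning is the temporal-difference update sketched below (hedged; `self.lr` and `self.gamma` are assumed to come from cfg, and `Q_table` is keyed by `str(state)` as in the snippet).

```python
import numpy as np

def update(self, state, action, reward, next_state, done):
    q_predict = self.Q_table[str(state)][action]
    if done:  # terminal state: no bootstrapping
        q_target = reward
    else:
        q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
    # move Q(s, a) towards the TD target; self.lr is the learning rate (assumed)
    self.Q_table[str(state)][action] += self.lr * (q_target - q_predict)
```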

View File

@@ -46,15 +46,15 @@ class ReplayBuffer:
return len(self.buffer) return len(self.buffer)
class MLP(nn.Module): class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128): def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络 """ 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态维度 n_states: 输入的特征数即环境的状态维度
action_dim: 输出的动作维度 n_actions: 输出的动作维度
""" """
super(MLP, self).__init__() super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x): def forward(self, x):
# 各层对应的激活函数 # 各层对应的激活函数
@@ -63,8 +63,8 @@ class MLP(nn.Module):
return self.fc3(x) return self.fc3(x)
class DoubleDQN: class DoubleDQN:
def __init__(self, state_dim, action_dim, cfg): def __init__(self, n_states, n_actions, cfg):
self.action_dim = action_dim # 总的动作个数 self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等 self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma self.gamma = cfg.gamma
# e-greedy策略相关参数 # e-greedy策略相关参数
@@ -73,8 +73,8 @@ class DoubleDQN:
self.epsilon_end = cfg.epsilon_end self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net copy from policy_net # target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data) target_param.data.copy_(param.data)
@@ -103,7 +103,7 @@ class DoubleDQN:
# 所以tensor.max(1)[1]返回最大值对应的下标即action # 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item() action = q_value.max(1)[1].item()
else: else:
action = random.randrange(self.action_dim) action = random.randrange(self.n_actions)
return action return action
def update(self): def update(self):
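The body of `update` is cut off here; its defining step is the Double DQN target, where the policy network selects the next action and the target network evaluates it. A minimal sketch with hypothetical tensor names:

```python
import torch

def double_dqn_target(policy_net, target_net, rewards, next_states, dones, gamma):
    with torch.no_grad():
        next_actions = policy_net(next_states).max(1)[1].unsqueeze(1)        # argmax from the policy net
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)  # evaluated by the target net
    return rewards + gamma * next_q * (1 - dones)                            # no bootstrap at terminal states
```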

View File

@@ -59,9 +59,9 @@ class Config:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.n n_actions = env.action_space.n
agent = DoubleDQN(state_dim,action_dim,cfg) agent = DoubleDQN(n_states,n_actions,cfg)
return env,agent return env,agent
def train(cfg,env,agent): def train(cfg,env,agent):

View File

@@ -136,12 +136,12 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"class DuelingNet(nn.Module):\n", "class DuelingNet(nn.Module):\n",
" def __init__(self, state_dim, action_dim,hidden_size=128):\n", " def __init__(self, n_states, n_actions,hidden_size=128):\n",
" super(DuelingNet, self).__init__()\n", " super(DuelingNet, self).__init__()\n",
" \n", " \n",
" # 隐藏层\n", " # 隐藏层\n",
" self.hidden = nn.Sequential(\n", " self.hidden = nn.Sequential(\n",
" nn.Linear(state_dim, hidden_size),\n", " nn.Linear(n_states, hidden_size),\n",
" nn.ReLU()\n", " nn.ReLU()\n",
" )\n", " )\n",
" \n", " \n",
@@ -149,7 +149,7 @@
" self.advantage = nn.Sequential(\n", " self.advantage = nn.Sequential(\n",
" nn.Linear(hidden_size, hidden_size),\n", " nn.Linear(hidden_size, hidden_size),\n",
" nn.ReLU(),\n", " nn.ReLU(),\n",
" nn.Linear(hidden_size, action_dim)\n", " nn.Linear(hidden_size, n_actions)\n",
" )\n", " )\n",
" \n", " \n",
" # 价值函数\n", " # 价值函数\n",
@@ -192,7 +192,7 @@
], ],
"source": [ "source": [
"class DuelingDQN:\n", "class DuelingDQN:\n",
" def __init__(self,state_dim,action_dim,cfg) -> None:\n", " def __init__(self,n_states,n_actions,cfg) -> None:\n",
" self.batch_size = cfg.batch_size\n", " self.batch_size = cfg.batch_size\n",
" self.device = cfg.device\n", " self.device = cfg.device\n",
" self.loss_history = [] # 记录loss的变化\n", " self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
" target_param.data.copy_(param.data)\n", " target_param.data.copy_(param.data)\n",
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
" q_values = self.policy_net(state)\n", " q_values = self.policy_net(state)\n",
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
" else:\n", " else:\n",
" action = random.randrange(self.action_dim)\n", " action = random.randrange(self.n_actions)\n",
" return action\n", " return action\n",
" def update(self):\n", " def update(self):\n",
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n", " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n",

View File

@@ -57,16 +57,16 @@ class MLP(nn.Module):
return self.fc3(x) return self.fc3(x)
class HierarchicalDQN: class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg): def __init__(self,n_states,n_actions,cfg):
self.state_dim = state_dim self.n_states = n_states
self.action_dim = action_dim self.n_actions = n_actions
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.device = cfg.device self.device = cfg.device
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.frame_idx = 0 # 用于epsilon的衰减计数 self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay) self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity) self.memory = ReplayBuffer(cfg.memory_capacity)
@@ -76,7 +76,7 @@ class HierarchicalDQN:
self.losses = [] self.losses = []
self.meta_losses = [] self.meta_losses = []
def to_onehot(self,x): def to_onehot(self,x):
oh = np.zeros(self.state_dim) oh = np.zeros(self.n_states)
oh[x - 1] = 1. oh[x - 1] = 1.
return oh return oh
def set_goal(self,state): def set_goal(self,state):
@@ -85,7 +85,7 @@ class HierarchicalDQN:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item() goal = self.meta_policy_net(state).max(1)[1].item()
else: else:
goal = random.randrange(self.state_dim) goal = random.randrange(self.n_states)
return goal return goal
def choose_action(self,state): def choose_action(self,state):
self.frame_idx += 1 self.frame_idx += 1
@@ -95,7 +95,7 @@ class HierarchicalDQN:
q_value = self.policy_net(state) q_value = self.policy_net(state)
action = q_value.max(1)[1].item() action = q_value.max(1)[1].item()
else: else:
action = random.randrange(self.action_dim) action = random.randrange(self.n_actions)
return action return action
def update(self): def update(self):
self.update_policy() self.update_policy()
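The controller network takes `2*n_states` inputs because it acts on the environment state concatenated with a one-hot encoding of the goal chosen by the meta-controller. A hedged usage sketch (variable names are hypothetical):

```python
import numpy as np

goal = agent.set_goal(state)                        # goal index from the meta-controller
onehot_goal = agent.to_onehot(goal)                 # shape (n_states,)
joint_state = np.concatenate([state, onehot_goal])  # shape (2 * n_states,), fed to policy_net
action = agent.choose_action(joint_state)
```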

View File

@@ -63,9 +63,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.n n_actions = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg) agent = HierarchicalDQN(n_states,n_actions,cfg)
return env,agent return env,agent
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2020 John Jim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -17,11 +17,11 @@ import dill
class FisrtVisitMC: class FisrtVisitMC:
''' On-Policy First-Visit MC Control ''' On-Policy First-Visit MC Control
''' '''
def __init__(self,action_dim,cfg): def __init__(self,n_actions,cfg):
self.action_dim = action_dim self.n_actions = n_actions
self.epsilon = cfg.epsilon self.epsilon = cfg.epsilon
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(action_dim)) self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns self.returns_sum = defaultdict(float) # sum of returns
self.returns_count = defaultdict(float) self.returns_count = defaultdict(float)
@@ -29,11 +29,11 @@ class FisrtVisitMC:
''' e-greedy policy ''' ''' e-greedy policy '''
if state in self.Q_table.keys(): if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state]) best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon) action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs) action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else: else:
action = np.random.randint(0,self.action_dim) action = np.random.randint(0,self.n_actions)
return action return action
def update(self,one_ep_transition): def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition # Find all (state, action) pairs we've visited in this one_ep_transition
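The body of `update` is cut off at the comment above; a first-visit MC control update typically looks like the hedged sketch below, assuming `one_ep_transition` is a list of `(state, action, reward)` tuples.

```python
def update(self, one_ep_transition):
    # every (state, action) pair visited in this episode
    sa_in_episode = set((s, a) for s, a, _ in one_ep_transition)
    for state, action in sa_in_episode:
        sa_pair = (state, action)
        # index of the first visit to this pair
        first_idx = next(i for i, (s, a, _) in enumerate(one_ep_transition)
                         if s == state and a == action)
        # discounted return following the first visit
        G = sum(r * (self.gamma ** k)
                for k, (_, _, r) in enumerate(one_ep_transition[first_idx:]))
        self.returns_sum[sa_pair] += G
        self.returns_count[sa_pair] += 1.0
        self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
```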

View File

@@ -43,8 +43,8 @@ class MCConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = RacetrackEnv() env = RacetrackEnv()
action_dim = 9 n_actions = 9
agent = FisrtVisitMC(action_dim, cfg) agent = FisrtVisitMC(n_actions, cfg)
return env,agent return env,agent
def train(cfg, env, agent): def train(cfg, env, agent):

View File

@@ -57,16 +57,16 @@ model就是actor和critic两个网络了
import torch.nn as nn import torch.nn as nn
from torch.distributions.categorical import Categorical from torch.distributions.categorical import Categorical
class Actor(nn.Module): class Actor(nn.Module):
def __init__(self,state_dim, action_dim, def __init__(self,n_states, n_actions,
hidden_dim=256): hidden_dim=256):
super(Actor, self).__init__() super(Actor, self).__init__()
self.actor = nn.Sequential( self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(n_states, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim), nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, action_dim), nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1) nn.Softmax(dim=-1)
) )
def forward(self, state): def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
return dist return dist
class Critic(nn.Module): class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim=256): def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__() super(Critic, self).__init__()
self.critic = nn.Sequential( self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(n_states, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim), nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
value = self.critic(state) value = self.critic(state)
return value return value
``` ```
这里Actor就是得到一个概率分布(Categorical,也可以是别的分布,可以搜索torch distributions),critic根据当前状态得到一个值,这里的输入维度可以是```state_dim+action_dim```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。 这里Actor就是得到一个概率分布(Categorical,也可以是别的分布,可以搜索torch distributions),critic根据当前状态得到一个值,这里的输入维度可以是```n_states+n_actions```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。
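For readers who want to try the state-plus-action critic mentioned above, a hedged sketch (the class name and the one-hot action encoding are assumptions, not the repo's code):

```python
import torch
import torch.nn as nn

class StateActionCritic(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super(StateActionCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states + n_actions, hidden_dim),  # state and (one-hot) action concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, state, action_onehot):
        return self.critic(torch.cat([state, action_onehot], dim=-1))
```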
### PPO update ### PPO update
定义一个update函数主要实现伪代码中的第六步和第七步 定义一个update函数主要实现伪代码中的第六步和第七步
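Those two pseudocode steps boil down to recomputing advantages (for example via GAE) over the sampled batch and optimizing the clipped surrogate objective together with a value loss. A hedged sketch of the loss computation only, with hypothetical tensor names:

```python
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values, policy_clip=0.2):
    ratio = (new_log_probs - old_log_probs).exp()                # pi_theta(a|s) / pi_theta_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantages
    actor_loss = -torch.min(surr1, surr2).mean()                 # clipped surrogate objective
    critic_loss = (returns - values).pow(2).mean()               # value regression
    return actor_loss + 0.5 * critic_loss
```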

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:30:46
LastEditor: John
LastEditTime: 2021-09-26 22:00:07
Discription:
Environment:
'''
import numpy as np
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:29:24
LastEditor: John
LastEditTime: 2021-04-08 22:36:43
Discription:
Environment:
'''
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value


View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-23 15:17:42 Date: 2021-03-23 15:17:42
LastEditor: John LastEditor: John
LastEditTime: 2021-09-26 22:02:00 LastEditTime: 2021-12-31 19:38:33
Discription: Discription:
Environment: Environment:
''' '''
@@ -13,25 +13,89 @@ import os
import numpy as np import numpy as np
import torch import torch
import torch.optim as optim import torch.optim as optim
from PPO.model import Actor,Critic import torch.nn as nn
from PPO.memory import PPOMemory from torch.distributions.categorical import Categorical
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []
class Actor(nn.Module):
def __init__(self,n_states, n_actions,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, n_states,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value
class PPO: class PPO:
def __init__(self, state_dim, action_dim,cfg): def __init__(self, n_states, n_actions,cfg):
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.continuous = cfg.continuous self.continuous = cfg.continuous
self.policy_clip = cfg.policy_clip self.policy_clip = cfg.policy_clip
self.n_epochs = cfg.n_epochs self.n_epochs = cfg.n_epochs
self.gae_lambda = cfg.gae_lambda self.gae_lambda = cfg.gae_lambda
self.device = cfg.device self.device = cfg.device
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device) self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device) self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
self.memory = PPOMemory(cfg.batch_size) self.memory = PPOMemory(cfg.batch_size)
self.loss = 0 self.loss = 0
def choose_action(self, state): def choose_action(self, state):
state = torch.tensor([state], dtype=torch.float).to(self.device) state = np.array([state]) # 先转成数组再转tensor更高效
state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor(state) dist = self.actor(state)
value = self.critic(state) value = self.critic(state)
action = dist.sample() action = dist.sample()

View File

@@ -5,63 +5,127 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym import gym
import torch import torch
import numpy as np
import datetime import datetime
from common.plot import plot_rewards from common.utils import plot_rewards
from common.utils import save_results,make_dir from common.utils import save_results,make_dir
from PPO.agent import PPO from ppo2 import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig: class Config:
def __init__(self) -> None: def __init__(self) -> None:
self.algo = "DQN" # 算法名称 ################################## 环境超参数 ###################################
self.algo_name = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称 self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作 self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数 self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数 self.test_eps = 20 # 测试的回合数
self.batch_size = 5 ################################################################################
self.gamma=0.99
################################## 算法超参数 ####################################
self.batch_size = 5 # mini-batch SGD中的批量大小
self.gamma = 0.95 # 强化学习中的折扣因子
self.n_epochs = 4 self.n_epochs = 4
self.actor_lr = 0.0003 self.actor_lr = 0.0003 # actor的学习率
self.critic_lr = 0.0003 self.critic_lr = 0.0003 # critic的学习率
self.gae_lambda=0.95 self.gae_lambda = 0.95
self.policy_clip=0.2 self.policy_clip = 0.2
self.hidden_dim = 256 self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update self.update_fre = 20 # 策略更新频率
################################################################################
class PlotConfig:
def __init__(self) -> None: ################################# 保存结果相关参数 ################################
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \ self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径 '/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \ self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径 '/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片 self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
if cfg.continuous:
n_actions = env.action_space.shape[0] # 动作维度
else:
n_actions = env.action_space.n # 动作维度
agent = PPO(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def env_agent_config(cfg,seed=1): def train(cfg,env,agent):
env = gym.make(cfg.env_name) print('开始训练!')
env.seed(seed) print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_dim = env.observation_space.shape[0] rewards = [] # 记录所有回合的奖励
action_dim = env.action_space.n ma_rewards = [] # 记录所有回合的滑动平均奖励
agent = PPO(state_dim,action_dim,cfg) steps = 0
return env,agent for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
cfg = PPOConfig() def test(cfg,env,agent):
plot_cfg = PlotConfig() print('开始测试!')
# 训练 print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
env,agent = env_agent_config(cfg,seed=1) rewards = [] # 记录所有回合的奖励
rewards, ma_rewards = train(cfg, env, agent) ma_rewards = [] # 记录所有回合的滑动平均奖励
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 for i_ep in range(cfg.test_eps):
agent.save(path=plot_cfg.model_path) state = env.reset()
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) done = False
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") ep_reward = 0
# 测试 while not done:
env,agent = env_agent_config(cfg,seed=10) action, prob, val = agent.choose_action(state)
agent.load(path=plot_cfg.model_path) state_, reward, done, _ = env.step(action)
rewards,ma_rewards = eval(cfg,env,agent) ep_reward += reward
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) state = state_
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成训练!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env,agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg)
agent.load(path=cfg.model_path)
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,cfg,tag="test")

View File

@@ -6,10 +6,9 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym import gym
import torch import torch
import datetime import datetime
from common.plot import plot_rewards from common.utils import plot_rewards
from common.utils import save_results,make_dir from common.utils import save_results,make_dir
from PPO.agent import PPO from ppo2 import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
@@ -45,9 +44,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] n_actions = env.action_space.shape[0]
agent = PPO(state_dim,action_dim,cfg) agent = PPO(n_states,n_actions,cfg)
return env,agent return env,agent

File diff suppressed because one or more lines are too long

View File

@@ -1,121 +0,0 @@
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = 0
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
def eval(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
ep_reward += reward
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成训练!')
return rewards,ma_rewards
if __name__ == '__main__':
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import save_results,make_dir
from PPO.agent import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数
self.batch_size = 5
self.gamma=0.99
self.n_epochs = 4
self.actor_lr = 0.0003
self.critic_lr = 0.0003
self.gae_lambda=0.95
self.policy_clip=0.2
self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
return env,agent
cfg = PPOConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 16:35:58
LastEditor: John
LastEditTime: 2021-12-21 23:21:26
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出:概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, action_dim的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x


View File

@@ -5,21 +5,41 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44 Date: 2020-11-22 23:27:44
LastEditor: John LastEditor: John
LastEditTime: 2021-10-16 00:43:52 LastEditTime: 2022-02-10 01:25:27
Discription: Discription:
Environment: Environment:
''' '''
import torch import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Bernoulli from torch.distributions import Bernoulli
from torch.autograd import Variable from torch.autograd import Variable
import numpy as np import numpy as np
from PolicyGradient.model import MLP
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, n_actions的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
class PolicyGradient: class PolicyGradient:
def __init__(self, state_dim,cfg): def __init__(self, n_states,cfg):
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim) self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
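Since the MLP above outputs a single sigmoid probability, action selection samples from a Bernoulli distribution; a hedged sketch, not necessarily the repo's exact method:

```python
def choose_action(self, state):
    state = torch.from_numpy(state).float()
    prob = self.policy_net(state)   # probability of taking action 1
    m = Bernoulli(prob)
    action = m.sample()
    return int(action.item())       # 0 or 1 for CartPole
```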

View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-02-10 06:13:21
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from itertools import count
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = "PolicyGradient" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.batch_size = 8 # mini-batch SGD中的批量大小
self.lr = 0.01 # 学习率
self.gamma = 0.99 # 强化学习中的折扣因子
self.hidden_dim = 36 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

View File

@@ -1,136 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2021-10-16 00:34:13
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加父路径到系统路径sys.path
import gym
import torch
import datetime
from itertools import count
from PolicyGradient.agent import PolicyGradient
from common.plot import plot_rewards
from common.utils import save_results,make_dir
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PGConfig:
def __init__(self):
self.algo = "PolicyGradient" # 算法名称
self.env = 'CartPole-v0' # 环境名称
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # 保存模型的路径
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
self.batch_size = 8
self.lr = 0.01 # 学习率
self.gamma = 0.99
self.hidden_dim = 36 # dimmension of hidden layer
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
agent = PolicyGradient(state_dim,cfg)
return env,agent
def train(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete training')
return rewards, ma_rewards
def eval(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete evaling')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = PGConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)


Some files were not shown because too many files have changed in this diff