update codes

Author: johnjim0816
Date: 2021-12-28 18:46:52 +08:00
Commit: bd51b5a7ad (parent 41fb561d25)
52 changed files with 305 additions and 292 deletions

View File

@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 class FCN(nn.Module):
-    def __init__(self, n_states=4, n_actions=18):
+    def __init__(self, state_dim=4, action_dim=18):
         """ Initialize the Q-network as a fully connected network
-            n_states: number of input features, i.e. the number of environment states
-            n_actions: total number of output actions
+            state_dim: number of input features, i.e. the dimension of the environment state
+            action_dim: total number of output actions
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(n_states, 128)  # input layer
+        self.fc1 = nn.Linear(state_dim, 128)  # input layer
         self.fc2 = nn.Linear(128, 128)  # hidden layer
-        self.fc3 = nn.Linear(128, n_actions)  # output layer
+        self.fc3 = nn.Linear(128, action_dim)  # output layer
     def forward(self, x):
         # activation functions for each layer
@@ -66,7 +66,7 @@ class FCN(nn.Module):
         x = F.relu(self.fc2(x))
         return self.fc3(x)
 ```
-The input dimension is n_states and the output dimension is n_actions, with one 128-dimensional hidden layer in between (the width and number of hidden layers can be increased as needed); relu is generally used as the activation function. This is the same as setting up a network in ordinary deep learning.
+The input dimension is state_dim and the output dimension is action_dim, with one 128-dimensional hidden layer in between (the width and number of hidden layers can be increased as needed); relu is generally used as the activation function. This is the same as setting up a network in ordinary deep learning.
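As a quick sanity check of the shapes, here is a minimal usage sketch for the network above (the CartPole-v0 sizes, state_dim=4 and action_dim=2, are assumed purely for illustration):

```python
import torch

# hypothetical shape check for the FCN defined above
net = FCN(state_dim=4, action_dim=2)   # CartPole-v0 sizes (assumed)
states = torch.randn(32, 4)            # a batch of 32 states
q_values = net(states)                 # forward pass through fc1 -> fc2 -> fc3
print(q_values.shape)                  # torch.Size([32, 2]), one Q-value per action
```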
### Replay Buffer
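The buffer implementation itself is elided from this excerpt; as a reference, here is a minimal sketch of the usual ring-buffer design (consistent with the push/sample/__len__ methods referenced below, but an assumption rather than the original code):

```python
import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of stored transitions
        self.buffer = []          # holds (state, action, reward, next_state, done) tuples
        self.position = 0         # index of the slot to overwrite next
    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)  # grow until capacity is reached
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity  # wrap around like a ring buffer
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  # uniform sampling de-correlates transitions
        return zip(*batch)  # transpose into (states, actions, rewards, next_states, dones)
    def __len__(self):
        return len(self.buffer)
```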
@@ -107,8 +107,8 @@ class ReplayBuffer:
 Inside the agent class, build the two networks together with the optimizer and the replay memory; the target network starts as a copy of the policy network and is only refreshed every few episodes, which keeps the learning targets stable:
 ```python
-self.policy_net = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):  # copy params from policy net
     target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
     if random.random() > self.epsilon(self.frame_idx):
         action = self.predict(state)
     else:
-        action = random.randrange(self.n_actions)
+        action = random.randrange(self.action_dim)
     return action
 ```
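The exploration probability self.epsilon(self.frame_idx) is not a constant: as the epsilon lambda in the agent code below shows, it decays exponentially from cfg.epsilon_start down to cfg.epsilon_end. A standalone sketch of that schedule (epsilon_start=0.90 and epsilon_end=0.01 are the values used in the training config further below; the decay rate of 500 is an assumed value for illustration):

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.90, 0.01, 500  # decay rate assumed

def epsilon(frame_idx):
    # starts near epsilon_start and approaches epsilon_end as frame_idx grows
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)

print(epsilon(0))     # ~0.90: explore almost every step at the start
print(epsilon(5000))  # ~0.01: mostly exploit once training has progressed
```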

View File

@@ -21,15 +21,15 @@ import math
 import numpy as np
 class MLP(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim=128):
+    def __init__(self, state_dim, action_dim, hidden_dim=128):
         """ Initialize the Q-network as a fully connected network
-            n_states: number of input features, i.e. the environment state
-            n_actions: output action dimension
+            state_dim: number of input features, i.e. the environment state dimension
+            action_dim: output action dimension
         """
         super(MLP, self).__init__()
-        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        self.fc1 = nn.Linear(state_dim, hidden_dim)  # input layer
         self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
-        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+        self.fc3 = nn.Linear(hidden_dim, action_dim)  # output layer
     def forward(self, x):
         # activation functions for each layer
@@ -62,9 +62,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, n_states, n_actions, cfg):
-        self.n_actions = n_actions  # total number of actions
+    def __init__(self, state_dim, action_dim, cfg):
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device, cpu or gpu
         self.gamma = cfg.gamma  # discount factor for rewards
         # parameters of the e-greedy policy
@@ -73,8 +73,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
         for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):  # copy parameters to target_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # optimizer
@@ -90,7 +90,7 @@ class DQN:
             q_values = self.policy_net(state)
             action = q_values.max(1)[1].item()  # choose the action with the largest Q-value
         else:
-            action = random.randrange(self.n_actions)
+            action = random.randrange(self.action_dim)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # do not update the policy until the memory holds at least one batch
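The body of update() is truncated in this hunk. Below is a sketch of how a standard DQN update proceeds from this point — sample a batch, form the TD target from the target network, and regress the policy network onto it — written as a continuation of the method above (the general recipe, not necessarily the exact original lines):

```python
        # sample a random mini-batch of transitions from the replay memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
            self.memory.sample(self.batch_size)
        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # Q(s,a) of the taken actions
        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()  # max_a' Q_target(s',a')
        # TD target: bootstrap from the target network only for non-terminal transitions
        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)
        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():  # clip gradients for stability
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
```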

View File

@@ -70,9 +70,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, n_states, n_actions, cfg):
-        self.n_actions = n_actions  # total number of actions
+    def __init__(self, state_dim, action_dim, cfg):
+        self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device, cpu or gpu
         self.gamma = cfg.gamma  # discount factor for rewards
         # parameters of the e-greedy policy
@@ -81,8 +81,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = CNN(n_states, n_actions).to(self.device)
-        self.target_net = CNN(n_states, n_actions).to(self.device)
+        self.policy_net = CNN(state_dim, action_dim).to(self.device)
+        self.target_net = CNN(state_dim, action_dim).to(self.device)
         for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):  # copy parameters to target_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # optimizer
@@ -98,7 +98,7 @@ class DQN:
             q_values = self.policy_net(state)
             action = q_values.max(1)[1].item()  # choose the action with the largest Q-value
         else:
-            action = random.randrange(self.n_actions)
+            action = random.randrange(self.action_dim)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # do not update the policy until the memory holds at least one batch
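This variant builds the same agent around a convolutional Q-network for image observations, but the CNN class itself is not shown in this excerpt. A minimal sketch of a typical convolutional Q-network follows; the layer sizes use the common DQN-on-Atari layout and are assumptions, not the original definition, and the first constructor argument is interpreted here as the number of input channels with 84x84 frames assumed:

```python
import torch
import torch.nn as nn

class CNN(nn.Module):
    ''' sketch of a convolutional Q-network for stacked-frame observations '''
    def __init__(self, state_dim, action_dim):
        super(CNN, self).__init__()
        # state_dim is taken to be the number of input channels (assumed)
        self.features = nn.Sequential(
            nn.Conv2d(state_dim, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        # infer the flattened feature size with a dummy forward pass (84x84 frames assumed)
        with torch.no_grad():
            n_flat = self.features(torch.zeros(1, state_dim, 84, 84)).view(1, -1).size(1)
        self.fc = nn.Sequential(
            nn.Linear(n_flat, 512), nn.ReLU(),
            nn.Linear(512, action_dim),  # one Q-value per action
        )
    def forward(self, x):
        x = self.features(x)
        return self.fc(x.view(x.size(0), -1))
```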

View File

@@ -7,23 +7,29 @@ sys.path.append(parent_path)  # add the parent path to sys.path
 import gym
 import torch
 import datetime
+import numpy as np
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards
 from DQN.dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get the current time
-algo_name = 'DQN'  # algorithm name
-env_name = 'CartPole-v0'  # environment name
-class DQNConfig:
+class Config:
     ''' hyperparameters
     '''
     def __init__(self):
-        self.algo_name = algo_name  # algorithm name
-        self.env_name = env_name  # environment name
+        ################################## environment hyperparameters ##################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
         self.train_eps = 200  # number of training episodes
         self.test_eps = 30  # number of test episodes
-        # hyperparameters
+        ################################################################################
+        ################################## algorithm hyperparameters ##################################
         self.gamma = 0.95  # discount factor in RL
         self.epsilon_start = 0.90  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
@@ -33,99 +39,106 @@ class DQNConfig:
         self.batch_size = 64  # batch size for mini-batch SGD
         self.target_update = 4  # update frequency of the target network
         self.hidden_dim = 256  # hidden layer dimension of the network
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo = algo_name  # algorithm name
-        self.env_name = env_name  # environment name
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
+        ################################################################################
+        ################################## saving-results parameters ##################################
         self.result_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/results/'  # path for saving results
         self.model_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/models/'  # path for saving models
         self.save = True  # whether to save figures
+        ################################################################################
 def env_agent_config(cfg, seed=1):
     ''' create the environment and the agent
     '''
     env = gym.make(cfg.env_name)  # create the environment
-    env.seed(seed)  # set random seed
-    n_states = env.observation_space.shape[0]  # number of states
-    n_actions = env.action_space.n  # number of actions
-    agent = DQN(n_states, n_actions, cfg)  # create the agent
+    state_dim = env.observation_space.shape[0]  # state dimension
+    action_dim = env.action_space.n  # action dimension
+    agent = DQN(state_dim, action_dim, cfg)  # create the agent
+    if seed != 0:  # set random seed
+        torch.manual_seed(seed)
+        env.seed(seed)
+        np.random.seed(seed)
     return env, agent
 def train(cfg, env, agent):
     ''' training
     '''
     print('Start training!')
     print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
     rewards = []  # record the reward of each episode
     ma_rewards = []  # record the moving-average reward of each episode
     for i_ep in range(cfg.train_eps):
         ep_reward = 0  # reward accumulated within one episode
         state = env.reset()  # reset the environment and return the initial state
         while True:
             action = agent.choose_action(state)  # choose an action
             next_state, reward, done, _ = env.step(action)  # step the environment, return the transition
             agent.memory.push(state, action, reward,
                               next_state, done)  # store the transition
             state = next_state  # move to the next state
             agent.update()  # update the agent
             ep_reward += reward  # accumulate the reward
             if done:
                 break
         if (i_ep + 1) % cfg.target_update == 0:  # update the agent's target network
             agent.target_net.load_state_dict(agent.policy_net.state_dict())
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
         else:
             ma_rewards.append(ep_reward)
         if (i_ep + 1) % 10 == 0:
             print('Episode: {}/{}, Reward: {}'.format(i_ep + 1, cfg.train_eps, ep_reward))
     print('Finished training!')
     return rewards, ma_rewards
 def test(cfg, env, agent):
     print('Start testing!')
     print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
     # testing does not need the epsilon-greedy policy, so the corresponding values are set to 0
     cfg.epsilon_start = 0.0  # initial epsilon of the e-greedy policy
     cfg.epsilon_end = 0.0  # final epsilon of the e-greedy policy
     rewards = []  # record the reward of each episode
     ma_rewards = []  # record the moving-average reward of each episode
     for i_ep in range(cfg.test_eps):
         ep_reward = 0  # reward accumulated within one episode
         state = env.reset()  # reset the environment and return the initial state
         while True:
             action = agent.choose_action(state)  # choose an action
             next_state, reward, done, _ = env.step(action)  # step the environment, return the transition
             state = next_state  # move to the next state
             ep_reward += reward  # accumulate the reward
             if done:
                 break
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
         else:
             ma_rewards.append(ep_reward)
         print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
     print('Finished testing!')
     return rewards, ma_rewards
 if __name__ == "__main__":
-    cfg = DQNConfig()
-    plot_cfg = PlotConfig()
+    cfg = Config()
     # training
     env, agent = env_agent_config(cfg, seed=1)
     rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
-    agent.save(path=plot_cfg.model_path)  # save the model
+    make_dir(cfg.result_path, cfg.model_path)  # create folders for saving results and models
+    agent.save(path=cfg.model_path)  # save the model
     save_results(rewards, ma_rewards, tag='train',
-                 path=plot_cfg.result_path)  # save results
-    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
+                 path=cfg.result_path)  # save results
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot results
     # testing
     env, agent = env_agent_config(cfg, seed=10)
-    agent.load(path=plot_cfg.model_path)  # load the model
+    agent.load(path=cfg.model_path)  # load the model
     rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save results
-    plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot results
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # save results
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot results

View File

@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
     '''
     env = gym.make(cfg.env_name)  # create the environment
     env.seed(seed)  # set random seed
-    n_states = env.observation_space.shape[0]  # states
-    n_actions = env.action_space.n  # actions
-    agent = DQN(n_states, n_actions, cfg)  # create the agent
+    state_dim = env.observation_space.shape[0]  # state dimension
+    action_dim = env.action_space.n  # action dimension
+    agent = DQN(state_dim, action_dim, cfg)  # create the agent
     return env, agent
 def train(cfg, env, agent):

View File

@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
     # env = wrap_deepmind(env)
     # env = wrap_pytorch(env)
     env.seed(seed)  # set random seed
-    n_states = env.observation_space.shape[0]  # states
-    n_actions = env.action_space.n  # actions
-    agent = DQN(n_states, n_actions, cfg)  # create the agent
+    state_dim = env.observation_space.shape[0]  # state dimension
+    action_dim = env.action_space.n  # action dimension
+    agent = DQN(state_dim, action_dim, cfg)  # create the agent
     return env, agent
 def train(cfg, env, agent):