Commit bab7f6fe8c by johnjim0816, 2022-07-13 23:52:05 +08:00 (parent 45cc4aff58)
66 changed files with 247 additions and 841 deletions

Binary image file not shown (before: 62 KiB).


@@ -0,0 +1,14 @@
------------------ start ------------------
algo_name : A2C
env_name : CartPole-v0
n_envs : 8
max_steps : 30000
n_steps : 5
gamma : 0.99
lr : 0.001
hidden_dim : 256
result_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/results/
model_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/models/
save_fig : True
device : cuda
------------------- end -------------------

Binary image file not shown (after: 64 KiB).


@@ -1,45 +1,43 @@
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path
 import gym
 import numpy as np
 import torch
 import torch.optim as optim
 import datetime
+import argparse
 from common.multiprocessing_env import SubprocVecEnv
 from a2c import ActorCritic
 from common.utils import save_results, make_dir
-from common.utils import plot_rewards
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-algo_name = 'A2C' # 算法名称
-env_name = 'CartPole-v0' # 环境名称
-class A2CConfig:
-    def __init__(self) -> None:
-        self.algo_name = algo_name# 算法名称
-        self.env_name = env_name # 环境名称
-        self.n_envs = 8 # 异步的环境数目
-        self.gamma = 0.99 # 强化学习中的折扣因子
-        self.hidden_dim = 256
-        self.lr = 1e-3 # learning rate
-        self.max_frames = 30000
-        self.n_steps = 5
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo_name = algo_name # 算法名称
-        self.env_name = env_name # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
-        self.result_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/results/' # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
+from common.utils import plot_rewards, save_args
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
+    parser.add_argument('--n_envs',default=8,type=int,help="numbers of environments")
+    parser.add_argument('--max_steps',default=20000,type=int,help="episodes of training")
+    parser.add_argument('--n_steps',default=5,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
+    parser.add_argument('--lr',default=1e-3,type=float,help="learning rate")
+    parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    args.device = torch.device(
+        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
+    return args
 def make_envs(env_name):
     def _thunk():
@@ -60,6 +58,7 @@ def test_env(env,model,vis=False):
         if vis: env.render()
         total_reward += reward
     return total_reward
+
 def compute_returns(next_value, rewards, masks, gamma=0.99):
     R = next_value
     returns = []
@@ -70,19 +69,19 @@ def compute_returns(next_value, rewards, masks, gamma=0.99):
 def train(cfg,envs):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
     env = gym.make(cfg.env_name) # a single env
     env.seed(10)
     n_states = envs.observation_space.shape[0]
     n_actions = envs.action_space.n
     model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
     optimizer = optim.Adam(model.parameters())
-    frame_idx = 0
+    step_idx = 0
     test_rewards = []
     test_ma_rewards = []
     state = envs.reset()
-    while frame_idx < cfg.max_frames:
+    while step_idx < cfg.max_steps:
         log_probs = []
         values = []
         rewards = []
@@ -101,16 +100,16 @@ def train(cfg,envs):
             rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
             masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
             state = next_state
-            frame_idx += 1
-            if frame_idx % 100 == 0:
+            step_idx += 1
+            if step_idx % 100 == 0:
                 test_reward = np.mean([test_env(env,model) for _ in range(10)])
-                print(f"frame_idx:{frame_idx}, test_reward:{test_reward}")
+                print(f"step_idx:{step_idx}, test_reward:{test_reward}")
                 test_rewards.append(test_reward)
                 if test_ma_rewards:
                     test_ma_rewards.append(0.9*test_ma_rewards[-1]+0.1*test_reward)
                 else:
                     test_ma_rewards.append(test_reward)
-                # plot(frame_idx, test_rewards)
+                # plot(step_idx, test_rewards)
         next_state = torch.FloatTensor(next_state).to(cfg.device)
         _, next_value = model(next_state)
         returns = compute_returns(next_value, rewards, masks)
@@ -124,15 +123,15 @@ def train(cfg,envs):
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
-    print('完成训练')
+    print('Finish training')
     return test_rewards, test_ma_rewards
 if __name__ == "__main__":
-    cfg = A2CConfig()
-    plot_cfg = PlotConfig()
+    cfg = get_args()
     envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)]
     envs = SubprocVecEnv(envs)
-    # 训练
+    # training
     rewards,ma_rewards = train(cfg,envs)
-    make_dir(plot_cfg.result_path,plot_cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果
-    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
+    make_dir(cfg.result_path,cfg.model_path)
+    save_args(cfg)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) # 保存结果
+    plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
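The hunks above reference compute_returns but show only its first lines. For context, what it computes is the standard bootstrapped n-step return R_t = r_t + gamma * mask_t * R_{t+1}; the sketch below is illustrative (the name compute_returns_sketch and the worked numbers are assumptions, not taken verbatim from a2c/task0.py):

def compute_returns_sketch(next_value, rewards, masks, gamma=0.99):
    """Bootstrapped n-step returns: R_t = r_t + gamma * mask_t * R_{t+1},
    seeded with the critic's value estimate for the state after the last step."""
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * masks[step] * R
        returns.insert(0, R)
    return returns

# Worked example with two non-terminal steps (mask = 1), next_value = 0.5, gamma = 0.99:
#   R_1 = 1 + 0.99 * 0.5   = 1.495
#   R_0 = 1 + 0.99 * 1.495 = 2.48005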


@@ -1,7 +0,0 @@
# DDPG
#TODO
## 伪代码
![image-20210320151900695](assets/image-20210320151900695.png)

Binary image file not shown (before: 259 KiB).
Binary image file not shown (before: 66 KiB).
Binary image file not shown (before: 74 KiB).


@@ -0,0 +1,18 @@
------------------ start ------------------
algo_name : DDPG
env_name : Pendulum-v1
train_eps : 300
test_eps : 20
gamma : 0.99
critic_lr : 0.001
actor_lr : 0.0001
memory_capacity : 8000
batch_size : 128
target_update : 2
soft_tau : 0.01
hidden_dim : 256
result_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/results/
model_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/models/
save_fig : True
device : cuda
------------------- end -------------------
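The soft_tau entry above controls how quickly DDPG's target networks track the online networks. A minimal sketch of the usual Polyak (soft) update such a parameter drives, written as a generic helper (soft_update is an assumed name and not necessarily how ddpg.py implements it):

import torch.nn as nn

def soft_update(target_net: nn.Module, policy_net: nn.Module, soft_tau: float = 1e-2):
    # theta_target <- soft_tau * theta_policy + (1 - soft_tau) * theta_target
    for target_param, param in zip(target_net.parameters(), policy_net.parameters()):
        target_param.data.copy_(soft_tau * param.data + (1.0 - soft_tau) * target_param.data)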

Binary image file not shown (after: 42 KiB).
Binary image file not shown (after: 66 KiB).


@@ -5,59 +5,51 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2022-06-09 19:05:20
+LastEditTime: 2022-07-13 22:53:11
 @Discription:
 @Environment: python 3.7.7
 '''
 import sys,os
-os.environ['KMP_DUPLICATE_LIB_OK']='True'
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径sys.path
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path
 import datetime
 import gym
 import torch
+import argparse
 from env import NormalizedActions,OUNoise
 from ddpg import DDPG
 from common.utils import save_results,make_dir
-from common.utils import plot_rewards
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-class Config:
-    '''超参数
-    '''
-    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DDPG' # 算法名称
-        self.env_name = 'Pendulum-v1' # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
-        self.seed = 10 # 随机种子置0则不设置随机种子
-        self.train_eps = 300 # 训练的回合数
-        self.test_eps = 20 # 测试的回合数
-        ################################################################################
-        ################################## 算法超参数 ###################################
-        self.gamma = 0.99 # 折扣因子
-        self.critic_lr = 1e-3 # 评论家网络的学习率
-        self.actor_lr = 1e-4 # 演员网络的学习率
-        self.memory_capacity = 8000 # 经验回放的容量
-        self.batch_size = 128 # mini-batch SGD中的批量大小
-        self.target_update = 2 # 目标网络的更新频率
-        self.hidden_dim = 256 # 网络隐藏层维度
-        self.soft_tau = 1e-2 # 软更新参数
-        ################################################################################
-        ################################# 保存结果相关参数 ################################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/' # 保存结果的路径
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
-        ################################################################################
+from common.utils import plot_rewards,save_args
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='DDPG',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='Pendulum-v1',type=str,help="name of environment")
+    parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
+    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
+    parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
+    parser.add_argument('--actor_lr',default=1e-4,type=float,help="learning rate of actor")
+    parser.add_argument('--memory_capacity',default=8000,type=int,help="memory capacity")
+    parser.add_argument('--batch_size',default=128,type=int)
+    parser.add_argument('--target_update',default=2,type=int)
+    parser.add_argument('--soft_tau',default=1e-2,type=float)
+    parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    args.device = torch.device(
+        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
+    return args
 def env_agent_config(cfg,seed=1):
     env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
@@ -67,9 +59,9 @@ def env_agent_config(cfg,seed=1):
     agent = DDPG(n_states,n_actions,cfg)
     return env,agent
 def train(cfg, env, agent):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name},算法:{cfg.algo_name},设备:{cfg.device}')
-    ou_noise = OUNoise(env.action_space) # 动作噪声
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
+    ou_noise = OUNoise(env.action_space) # noise of action
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 记录所有回合的滑动平均奖励
     for i_ep in range(cfg.train_eps):
@@ -88,18 +80,18 @@ def train(cfg, env, agent):
             agent.update()
             state = next_state
         if (i_ep+1)%10 == 0:
-            print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
+            print(f'Env:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
             ma_rewards.append(ep_reward)
-    print('完成训练!')
+    print('Finish training!')
     return rewards, ma_rewards
 def test(cfg, env, agent):
-    print('开始测试!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start testing')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 记录所有回合的滑动平均奖励
     for i_ep in range(cfg.test_eps):
@@ -113,25 +105,25 @@ def test(cfg, env, agent):
             next_state, reward, done, _ = env.step(action)
             ep_reward += reward
             state = next_state
-        print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
             ma_rewards.append(ep_reward)
-        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
-    print('完成测试!')
+        print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
+    print('Finish testing!')
     return rewards, ma_rewards
 if __name__ == "__main__":
-    cfg = Config()
-    # 训练
+    cfg = get_args()
+    # training
     env,agent = env_agent_config(cfg,seed=1)
     rewards, ma_rewards = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)
+    save_args(cfg)
     agent.save(path=cfg.model_path)
     save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
     plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
-    # 测试
+    # testing
     env,agent = env_agent_config(cfg,seed=10)
     agent.load(path=cfg.model_path)
     rewards,ma_rewards = test(cfg,env,agent)
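train() above draws exploration noise from OUNoise, which is imported from env.py and not touched by this commit. As a reference for what that process typically looks like, here is a self-contained Ornstein-Uhlenbeck sketch; the class name and the default theta/sigma values are assumptions, not the repo's implementation:

import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: x <- x + theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.3):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.action_dim = action_dim
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state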


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2022-03-02 11:05:11
+LastEditTime: 2022-07-13 00:08:18
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -20,7 +20,22 @@ import random
 import math
 import numpy as np
+class MLP(nn.Module):
+    def __init__(self, n_states,n_actions,hidden_dim=128):
+        """ 初始化q网络为全连接网络
+            n_states: 输入的特征数即环境的状态维度
+            n_actions: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
 class ReplayBuffer:
     def __init__(self, capacity):
@@ -47,7 +62,7 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, n_actions,model,cfg):
+    def __init__(self, n_states,n_actions,cfg):
         self.n_actions = n_actions # 总的动作个数
         self.device = cfg.device # 设备cpu或gpu等
@@ -58,8 +73,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = model.to(self.device)
-        self.target_net = model.to(self.device)
+        self.policy_net = MLP(n_states,n_actions).to(self.device)
+        self.target_net = MLP(n_states,n_actions).to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
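The context lines above show the epsilon-greedy schedule DQN keeps using after this refactor: epsilon decays exponentially from epsilon_start toward epsilon_end with time constant epsilon_decay. A standalone sketch with the defaults from the DQN params listing (the function name is illustrative, not a repo function):

import math

def epsilon_by_frame(frame_idx, epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=500):
    """epsilon(t) = eps_end + (eps_start - eps_end) * exp(-t / decay)."""
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1.0 * frame_idx / epsilon_decay)

# epsilon_by_frame(0)    -> 0.95
# epsilon_by_frame(500)  -> about 0.36
# epsilon_by_frame(5000) -> about 0.01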

Binary image file not shown (before: 26 KiB).
Binary image file not shown (before: 40 KiB).
Binary image file not shown (before: 41 KiB).


@@ -0,0 +1,19 @@
------------------ start ------------------
algo_name : DQN
env_name : CartPole-v0
train_eps : 200
test_eps : 20
gamma : 0.95
epsilon_start : 0.95
epsilon_end : 0.01
epsilon_decay : 500
lr : 0.0001
memory_capacity : 100000
batch_size : 64
target_update : 4
hidden_dim : 256
result_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/results/
model_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/models/
save_fig : True
device : cuda
------------------- end -------------------
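The target_update value above is the episode interval for hard target-network syncs, which the training loops in this commit perform with load_state_dict. A small helper-style sketch of that pattern (maybe_sync_target is a hypothetical name; the agent attributes follow the DQN class shown in dqn.py):

def maybe_sync_target(agent, i_ep, target_update=4):
    """Copy policy-net weights into the target net every `target_update` episodes."""
    if (i_ep + 1) % target_update == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())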

Binary image file not shown (after: 48 KiB).
Binary image file not shown (before: 55 KiB).
Binary image file not shown (before: 58 KiB).


@@ -1,5 +1,7 @@
+from lib2to3.pytree import type_repr
 import sys
 import os
+from parso import parse
 import torch.nn as nn
 import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
@@ -10,86 +12,58 @@ import gym
 import torch
 import datetime
 import numpy as np
+import argparse
 from common.utils import save_results_1, make_dir
-from common.utils import plot_rewards
+from common.utils import plot_rewards,save_args
 from dqn import DQN
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-class MLP(nn.Module):
-    def __init__(self, n_states,n_actions,hidden_dim=128):
-        """ 初始化q网络为全连接网络
-            n_states: 输入的特征数即环境的状态维度
-            n_actions: 输出的动作维度
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
-    def forward(self, x):
-        # 各层对应的激活函数
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
-class Config:
-    '''超参数
-    '''
-    def __init__(self):
-        ############################### hyperparameters ################################
-        self.algo_name = 'DQN' # algorithm name
-        self.env_name = 'CartPole-v0' # environment name
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu") # check GPU
-        self.seed = 10 # 随机种子置0则不设置随机种子
-        self.train_eps = 200 # 训练的回合数
-        self.test_eps = 20 # 测试的回合数
-        ################################################################################
-        ################################## 算法超参数 ###################################
-        self.gamma = 0.95 # 强化学习中的折扣因子
-        self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
-        self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
-        self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
-        self.lr = 0.0001 # 学习率
-        self.memory_capacity = 100000 # 经验回放的容量
-        self.batch_size = 64 # mini-batch SGD中的批量大小
-        self.target_update = 4 # 目标网络的更新频率
-        self.hidden_dim = 256 # 网络隐藏层
-        ################################################################################
-        ################################# 保存结果相关参数 ################################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/' # 保存结果的路径
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
-        ################################################################################
-def env_agent_config(cfg):
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
+    parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
+    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
+    parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
+    parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
+    parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
+    parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
+    parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
+    parser.add_argument('--batch_size',default=64,type=int)
+    parser.add_argument('--target_update',default=4,type=int)
+    parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    args.device = torch.device(
+        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
+    return args
+def env_agent_config(cfg,seed=1):
     ''' 创建环境和智能体
     '''
     env = gym.make(cfg.env_name) # 创建环境
     n_states = env.observation_space.shape[0] # 状态维度
     n_actions = env.action_space.n # 动作维度
     print(f"n states: {n_states}, n actions: {n_actions}")
-    model = MLP(n_states,n_actions)
-    agent = DQN(n_actions, model, cfg) # 创建智能体
-    if cfg.seed !=0: # 设置随机种子
-        torch.manual_seed(cfg.seed)
-        env.seed(cfg.seed)
-        np.random.seed(cfg.seed)
+    agent = DQN(n_states,n_actions, cfg) # 创建智能体
+    if seed !=0: # 设置随机种子
+        torch.manual_seed(seed)
+        env.seed(seed)
+        np.random.seed(seed)
     return env, agent
 def train(cfg, env, agent):
-    ''' 训练
+    ''' Training
     '''
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 记录所有回合的滑动平均奖励
     steps = []
@@ -117,7 +91,7 @@ def train(cfg, env, agent):
         else:
             ma_rewards.append(ep_reward)
         if (i_ep + 1) % 1 == 0:
-            print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+            print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
     print('Finish training!')
     env.close()
     res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
@@ -152,18 +126,19 @@ def test(cfg, env, agent):
             ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
         else:
             ma_rewards.append(ep_reward)
-        print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
+        print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
     print('完成测试!')
     env.close()
     return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
 if __name__ == "__main__":
-    cfg = Config()
+    cfg = get_args()
     # 训练
     env, agent = env_agent_config(cfg)
     res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
+    save_args(cfg)
     agent.save(path=cfg.model_path) # 保存模型
     save_results_1(res_dic, tag='train',
                    path=cfg.result_path) # 保存结果
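Both train() and test() above maintain ma_rewards with the same recurrence, which is simply an exponential moving average seeded by the first episode reward. Isolated for clarity (update_ma is an illustrative name, not a repo function):

def update_ma(ma_rewards, ep_reward, beta=0.9):
    """ma_t = beta * ma_{t-1} + (1 - beta) * r_t; the first reward seeds the average."""
    if ma_rewards:
        ma_rewards.append(beta * ma_rewards[-1] + (1 - beta) * ep_reward)
    else:
        ma_rewards.append(ep_reward)
    return ma_rewards

# update_ma([], 10.0)     -> [10.0]
# update_ma([10.0], 20.0) -> [10.0, 11.0]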


@@ -1,168 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2022-06-18 20:12:20
Discription: 使用 Nature DQN 训练 CartPole-v1
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import torch.nn as nn
import torch.nn.functional as F
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = "DQN" # 算法名称
env_name = 'CartPole-v1' # 环境名称
class DQNConfig:
''' 算法相关参数设置
'''
def __init__(self):
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 300 # 训练的回合数
self.test_eps = 20 # 测试的回合数
# 超参数
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.99 # e-greedy策略中初始epsilon
self.epsilon_end = 0.005 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
self.lr = 0.0001 # 学习率
self.memory_capacity = 100000 # 经验回放的容量
self.batch_size = 128 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
class PlotConfig:
''' 绘图相关参数设置
'''
def __init__(self) -> None:
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)
def env_agent_config(cfg, seed=1):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
env.seed(seed) # 设置随机种子
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
model = MLP(n_states,n_actions)
agent = DQN(n_actions,model,cfg) # 创建智能体
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward, next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('完成训练!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果


@@ -1,150 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2022-02-10 06:17:46
Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from common.atari_wrappers import make_atari, wrap_deepmind
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = 'DQN-cnn' # 算法名称
env_name = 'PongNoFrameskip-v4' # 环境名称
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
class DQNConfig:
''' 算法相关参数设置
'''
def __init__(self):
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = device # 检测GPU
self.train_eps = 500 # 训练的回合数
self.test_eps = 30 # 测试的回合数
# 超参数
self.gamma = 0.95 # 强化学习中的折扣因子
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
self.lr = 0.0001 # 学习率
self.memory_capacity = 100000 # 经验回放的容量
self.batch_size = 64 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层
class PlotConfig:
''' 绘图相关参数设置
'''
def __init__(self) -> None:
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = device # 检测GPU
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg, seed=1):
''' 创建环境和智能体
'''
env = make_atari(cfg.env_name) # 创建环境
# env = wrap_deepmind(env)
# env = wrap_pytorch(env)
env.seed(seed) # 设置随机种子
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
agent = DQN(n_states, n_actions, cfg) # 创建智能体
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward, next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('完成训练!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果


@@ -1,180 +0,0 @@
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results_1, make_dir
from common.utils import plot_rewards
from dqn_1 import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=256):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc4 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return self.fc4(x)
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
# self.env_name = 'Breakout-ram-v0' # 环境名称
self.env_name = 'ALE/Pong-ram-v5'
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 5 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500000 # e-greedy策略中epsilon的衰减率
self.lr = 0.00025 # 学习率
self.memory_capacity = int(5e4) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
model = MLP(n_states,n_actions)
agent = DQN(n_states, n_actions, model, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
ep_step = 0
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results_1(res_dic, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results_1(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果


@@ -1,149 +0,0 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
self.env_name = 'SpaceInvaders-ram-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 20000 # e-greedy策略中epsilon的衰减率
self.lr = 2e-4 # 学习率
self.memory_capacity = int(1e5) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
agent = DQN(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2022-02-10 00:54:02
+LastEditTime: 2022-06-21 19:36:05
 Discription:
 Environment:
 '''
@@ -84,8 +84,6 @@ def train(cfg,env,agent):
 def test(cfg,env,agent):
     print('开始测试!')
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    for item in agent.Q_table.items():
-        print(item)
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 滑动平均的奖励
     for i_ep in range(cfg.test_eps):


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 16:02:24
 LastEditor: John
-LastEditTime: 2022-02-28 11:50:11
+LastEditTime: 2022-07-13 22:15:46
 Discription:
 Environment:
 '''
@@ -27,33 +27,33 @@ def chinese_font():
         font = None
     return font
-def plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag='train'):
+def plot_rewards_cn(rewards, ma_rewards, cfg, tag='train'):
     ''' 中文画图
     '''
     sns.set()
     plt.figure()
-    plt.title(u"{}环境下{}算法的学习曲线".format(plot_cfg.env_name,
-              plot_cfg.algo_name), fontproperties=chinese_font())
+    plt.title(u"{}环境下{}算法的学习曲线".format(cfg.env_name,
+              cfg.algo_name), fontproperties=chinese_font())
     plt.xlabel(u'回合数', fontproperties=chinese_font())
     plt.plot(rewards)
     plt.plot(ma_rewards)
     plt.legend((u'奖励', u'滑动平均奖励',), loc="best", prop=chinese_font())
-    if plot_cfg.save:
-        plt.savefig(plot_cfg.result_path+f"{tag}_rewards_curve_cn")
+    if cfg.save:
+        plt.savefig(cfg.result_path+f"{tag}_rewards_curve_cn")
     # plt.show()
-def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'):
+def plot_rewards(rewards, ma_rewards, cfg, tag='train'):
     sns.set()
     plt.figure() # 创建一个图形实例,方便同时多画几个图
     plt.title("learning curve on {} of {} for {}".format(
-        plot_cfg.device, plot_cfg.algo_name, plot_cfg.env_name))
+        cfg.device, cfg.algo_name, cfg.env_name))
     plt.xlabel('epsiodes')
     plt.plot(rewards, label='rewards')
     plt.plot(ma_rewards, label='ma rewards')
     plt.legend()
-    if plot_cfg.save:
-        plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag))
+    if cfg.save_fig:
+        plt.savefig(cfg.result_path+"{}_rewards_curve".format(tag))
     plt.show()
@@ -80,7 +80,7 @@ def save_results(rewards, ma_rewards, tag='train', path='./results'):
     '''
     np.save(path+'{}_rewards.npy'.format(tag), rewards)
     np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
-    print('结果保存完毕!')
+    print('Result saved!')
 def make_dir(*paths):
@@ -98,3 +98,14 @@ def del_empty_dir(*paths):
         for dir in dirs:
             if not os.listdir(os.path.join(path, dir)):
                 os.removedirs(os.path.join(path, dir))
+def save_args(args):
+    # save parameters
+    argsDict = args.__dict__
+    with open(args.result_path+'params.txt', 'w') as f:
+        f.writelines('------------------ start ------------------' + '\n')
+        for eachArg, value in argsDict.items():
+            f.writelines(eachArg + ' : ' + str(value) + '\n')
+        f.writelines('------------------- end -------------------')
+    print("Parameters saved!")

notebooks/QLearning.ipynb (new file, +19 lines)

@@ -0,0 +1,19 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}