diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index e095bc5..997401b 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -10,12 +10,40 @@ Discription: Environment: ''' import torch.optim as optim -from A2C.model import ActorCritic +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Categorical + +class ActorCritic(nn.Module): + ''' A2C网络模型,包含一个Actor和Critic + ''' + def __init__(self, input_dim, output_dim, hidden_dim): + super(ActorCritic, self).__init__() + self.critic = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + + self.actor = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, output_dim), + nn.Softmax(dim=1), + ) + + def forward(self, x): + value = self.critic(x) + probs = self.actor(x) + dist = Categorical(probs) + return dist, value class A2C: - def __init__(self,n_states,n_actions,cfg) -> None: + ''' A2C算法 + ''' + def __init__(self,state_dim,action_dim,cfg) -> None: self.gamma = cfg.gamma self.device = cfg.device - self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device) + self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device) self.optimizer = optim.Adam(self.model.parameters()) def compute_returns(self,next_value, rewards, masks): diff --git a/codes/A2C/model.py b/codes/A2C/model.py deleted file mode 100644 index 473bcb2..0000000 --- a/codes/A2C/model.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-05-03 21:38:54 -LastEditor: JiangJi -LastEditTime: 2021-05-03 21:40:06 -Discription: -Environment: -''' -import torch.nn as nn -import torch.nn.functional as F -from torch.distributions import Categorical -class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim): - super(ActorCritic, self).__init__() - - self.critic = nn.Sequential( - nn.Linear(n_states, 
hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, 1) - ) - - self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, n_actions), - nn.Softmax(dim=1), - ) - - def forward(self, x): - value = self.critic(x) - probs = self.actor(x) - dist = Categorical(probs) - return dist, value \ No newline at end of file diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy new file mode 100644 index 0000000..6537afd Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy differ diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy new file mode 100644 index 0000000..56f779b Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy differ diff --git a/codes/A2C/task0_train.ipynb b/codes/A2C/task0.ipynb similarity index 100% rename from codes/A2C/task0_train.ipynb rename to codes/A2C/task0.ipynb diff --git a/codes/A2C/task0_train.py b/codes/A2C/task0.py similarity index 86% rename from codes/A2C/task0_train.py rename to codes/A2C/task0.py index 5927048..fd54d87 100644 --- a/codes/A2C/task0_train.py +++ b/codes/A2C/task0.py @@ -1,7 +1,8 @@ -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径sys.path +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 import gym import numpy as np @@ -9,15 +10,18 @@ import torch import torch.optim as optim import datetime from common.multiprocessing_env import SubprocVecEnv -from A2C.model import ActorCritic +from A2C.agent import ActorCritic from common.utils 
import save_results, make_dir -from common.plot import plot_rewards +from common.utils import plot_rewards + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'A2C' # 算法名称 +env_name = 'CartPole-v0' # 环境名称 -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time class A2CConfig: def __init__(self) -> None: - self.algo='A2C' # 算法名称 - self.env_name= 'CartPole-v0' # 环境名称 + self.algo_name = algo_name# 算法名称 + self.env_name = env_name # 环境名称 self.n_envs = 8 # 异步的环境数目 self.gamma = 0.99 # 强化学习中的折扣因子 self.hidden_dim = 256 @@ -27,10 +31,9 @@ class A2CConfig: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") class PlotConfig: def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ '/'+curr_time+'/results/' # 保存结果的路径 self.model_path = curr_path+"/outputs/" + self.env_name + \ @@ -67,6 +70,8 @@ def compute_returns(next_value, rewards, masks, gamma=0.99): def train(cfg,envs): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') env = gym.make(cfg.env_name) # a single env env.seed(10) state_dim = envs.observation_space.shape[0] @@ -119,6 +124,7 @@ def train(cfg,envs): optimizer.zero_grad() loss.backward() optimizer.step() + print('完成训练!') return test_rewards, test_ma_rewards if __name__ == "__main__": cfg = A2CConfig() diff --git a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 528872e..01ded1c 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -9,22 +9,75 @@ LastEditTime: 2021-09-16 00:55:30 @Discription: @Environment: python 3.7.7 ''' +import random import numpy as np import torch import torch.nn as nn import torch.optim as optim - -from common.model import Actor, Critic -from common.memory import 
ReplayBuffer - - +import torch.nn.functional as F +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) +class Actor(nn.Module): + def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + super(Actor, self).__init__() + self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, n_actions) + + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, x): + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = torch.tanh(self.linear3(x)) + return x +class Critic(nn.Module): + def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + super(Critic, self).__init__() + + self.linear1 = nn.Linear(n_states + n_actions, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) + # 随机初始化为较小的值 + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, state, action): + # 按维数1拼接 + x = torch.cat([state, action], 1) + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = self.linear3(x) + return x class DDPG: - def __init__(self, state_dim, action_dim, cfg): + def __init__(self, n_states, n_actions, cfg): self.device = cfg.device - 
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) - self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) - self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) - self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) # 复制参数到目标网络 for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): diff --git a/codes/DDPG/env.py b/codes/DDPG/env.py index 99da3c5..92fe482 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -16,12 +16,10 @@ class NormalizedActions(gym.ActionWrapper): ''' 将action范围重定在[0.1]之间 ''' def action(self, action): - low_bound = self.action_space.low upper_bound = self.action_space.high action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) action = np.clip(action, low_bound, upper_bound) - return action def reverse_action(self, action): diff --git a/codes/DDPG/task0.py b/codes/DDPG/task0.py new file mode 100644 index 0000000..33872f4 --- /dev/null +++ b/codes/DDPG/task0.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +@Author: John +@Email: johnjim0816@gmail.com +@Date: 2020-06-11 20:58:21 +@LastEditor: John +LastEditTime: 2021-09-16 01:31:33 +@Discription: +@Environment: python 3.7.7 +''' +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径sys.path + +import datetime +import gym +import torch + +from DDPG.env import NormalizedActions +from DDPG.agent import DDPG +from DDPG.train import train,test +from common.utils import save_results,make_dir +from common.utils 
import plot_rewards + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'DDPG' # 算法名称 +env_name = 'Pendulum-v1' # 环境名称,gym新版本(约0.21.0之后)中Pendulum-v0改为Pendulum-v1 + +class DDPGConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 300 # 训练的回合数 + self.eval_eps = 50 # 测试的回合数 + self.gamma = 0.99 # 折扣因子 + self.critic_lr = 1e-3 # 评论家网络的学习率 + self.actor_lr = 1e-4 # 演员网络的学习率 + self.memory_capacity = 8000 # 经验回放的容量 + self.batch_size = 128 # mini-batch SGD中的批量大小 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层维度 + self.soft_tau = 1e-2 # 软更新参数 + +class PlotConfig: + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + +def env_agent_config(cfg,seed=1): + env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 + env.seed(seed) # 随机种子 + n_states = env.observation_space.shape[0] + n_actions = env.action_space.shape[0] + agent = DDPG(n_states,n_actions,cfg) + return env,agent + +cfg = DDPGConfig() +plot_cfg = PlotConfig() +# 训练 +env,agent = env_agent_config(cfg,seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) +agent.save(path=plot_cfg.model_path) +save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env,agent = env_agent_config(cfg,seed=10) +agent.load(path=plot_cfg.model_path) +rewards,ma_rewards = test(plot_cfg,env,agent) +save_results(rewards,ma_rewards,tag = 'test',path 
= cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + diff --git a/codes/DDPG/task0_train.py b/codes/DDPG/task0_train.py deleted file mode 100644 index ea76661..0000000 --- a/codes/DDPG/task0_train.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 20:58:21 -@LastEditor: John -LastEditTime: 2021-09-16 01:31:33 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径sys.path - -import datetime -import gym -import torch - -from DDPG.env import NormalizedActions, OUNoise -from DDPG.agent import DDPG -from common.utils import save_results,make_dir -from common.plot import plot_rewards - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 - -class DDPGConfig: - def __init__(self): - self.algo = 'DDPG' # 算法名称 - self.env_name = 'Pendulum-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 300 # 训练的回合数 - self.eval_eps = 50 # 测试的回合数 - self.gamma = 0.99 # 折扣因子 - self.critic_lr = 1e-3 # 评论家网络的学习率 - self.actor_lr = 1e-4 # 演员网络的学习率 - self.memory_capacity = 8000 # 经验回放的容量 - self.batch_size = 128 # mini-batch SGD中的批量大小 - self.target_update = 2 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层维度 - self.soft_tau = 1e-2 # 软更新参数 - -class PlotConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - -def env_agent_config(cfg,seed=1): - env = 
NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 - env.seed(seed) # 随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states,n_actions,cfg) - return env,agent - -def train(cfg, env, agent): - print('开始训练!') - print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}') - ou_noise = OUNoise(env.action_space) # 动作噪声 - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.train_eps): - state = env.reset() - ou_noise.reset() - done = False - ep_reward = 0 - i_step = 0 - while not done: - i_step += 1 - action = agent.choose_action(state) - action = ou_noise.get_action(action, i_step) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - agent.update() - state = next_state - if (i_ep+1)%10 == 0: - print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('完成训练!') - return rewards, ma_rewards - -def eval(cfg, env, agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.eval_eps): - state = env.reset() - done = False - ep_reward = 0 - i_step = 0 - while not done: - i_step += 1 - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - state = next_state - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('完成测试!') - return rewards, ma_rewards - - -if __name__ == "__main__": - cfg = DDPGConfig() - plot_cfg = PlotConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - 
make_dir(plot_cfg.result_path, plot_cfg.model_path) - agent.save(path=plot_cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=plot_cfg.model_path) - rewards,ma_rewards = eval(plot_cfg,env,agent) - save_results(rewards,ma_rewards,tag = 'eval',path = cfg.result_path) - plot_rewards(rewards,ma_rewards,plot_cfg,tag = "eval") - diff --git a/codes/DDPG/train.py b/codes/DDPG/train.py new file mode 100644 index 0000000..8554cd0 --- /dev/null +++ b/codes/DDPG/train.py @@ -0,0 +1,64 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +from DDPG.env import OUNoise + +def train(cfg, env, agent): + print('开始训练!') + print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}') + ou_noise = OUNoise(env.action_space) # 动作噪声 + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + ou_noise.reset() + done = False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + action = ou_noise.get_action(action, i_step) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) + agent.update() + state = next_state + if (i_ep+1)%10 == 0: + print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg, env, agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.eval_eps): + state = env.reset() + done 
= False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + state = next_state + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.eval_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards, ma_rewards \ No newline at end of file diff --git a/codes/DQN/agent.py b/codes/DQN/agent.py index 27845d2..2e1e5de 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN/agent.py @@ -14,16 +14,57 @@ LastEditTime: 2021-09-15 13:35:36 import torch import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim import random import math import numpy as np -from common.memory import ReplayBuffer -from common.model import MLP -class DQN: - def __init__(self, n_states, n_actions, cfg): - self.n_actions = n_actions # 总的动作个数 +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): + """ 初始化q网络,为全连接网络 + state_dim: 输入的特征数即环境的状态数 + action_dim: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 + + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, 
batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) + +class DQN: + def __init__(self, state_dim, action_dim, cfg): + + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -32,8 +73,8 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. * frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 @@ -49,7 +90,7 @@ class DQN: q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py new file mode 100644 index 0000000..e4c326e --- /dev/null +++ b/codes/DQN/task0.py @@ -0,0 +1,75 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.utils import save_results, make_dir +from common.utils import plot_rewards +from DQN.agent import DQN +from DQN.train import train,test + +curr_time = 
datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = "DQN" # 算法名称 +env_name = 'CartPole-v0' # 环境名称 + +class DQNConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 30 # 测试的回合数 + # 超参数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + def __init__(self) -> None: + self.algo = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg, seed=1): + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env_name) # 创建环境 + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.shape[0] # 状态数 + action_dim = env.action_space.n # 动作数 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + return env, agent + + +cfg = DQNConfig() +plot_cfg = PlotConfig() +# 训练 +env, agent = env_agent_config(cfg, seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) # 保存模型 +save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env, agent = env_agent_config(cfg, seed=10) +agent.load(path=plot_cfg.model_path) # 导入模型 +rewards, 
ma_rewards = test(cfg, env, agent) +save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task1.py b/codes/DQN/task1.py new file mode 100644 index 0000000..d85a2ef --- /dev/null +++ b/codes/DQN/task1.py @@ -0,0 +1,83 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.utils import save_results, make_dir +from common.utils import plot_rewards, plot_rewards_cn +from DQN.agent import DQN +from DQN.train import train,test + + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = "DQN" # 算法名称 +env_name = 'CartPole-v1' # 环境名称 +class DQNConfig: + ''' 算法相关参数设置 + ''' + + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 30 # 测试的回合数 + # 超参数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + +def 
env_agent_config(cfg, seed=1): + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env_name) # 创建环境 + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.shape[0] # 状态数 + action_dim = env.action_space.n # 动作数 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + return env, agent + + +cfg = DQNConfig() +plot_cfg = PlotConfig() +# 训练 +env, agent = env_agent_config(cfg, seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) # 保存模型 +save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 +plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env, agent = env_agent_config(cfg, seed=10) +agent.load(path=plot_cfg.model_path) # 导入模型 +rewards, ma_rewards = test(cfg, env, agent) +save_results(rewards, ma_rewards, tag='test', + path=plot_cfg.result_path) # 保存结果 +plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task0_train.ipynb b/codes/DQN/train.ipynb similarity index 98% rename from codes/DQN/task0_train.ipynb rename to codes/DQN/train.ipynb index 464e216..ba4308e 100644 --- a/codes/DQN/task0_train.ipynb +++ b/codes/DQN/train.ipynb @@ -38,15 +38,15 @@ "outputs": [], "source": [ "class MLP(nn.Module):\n", - " def __init__(self, n_states,n_actions,hidden_dim=128):\n", + " def __init__(self, state_dim,action_dim,hidden_dim=128):\n", " \"\"\" 初始化q网络,为全连接网络\n", - " n_states: 输入的特征数即环境的状态数\n", - " n_actions: 输出的动作维度\n", + " state_dim: 输入的特征数即环境的状态数\n", + " action_dim: 输出的动作维度\n", " \"\"\"\n", " super(MLP, self).__init__()\n", - " self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层\n", + " self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层\n", " self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层\n", - " self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层\n", + " self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层\n", " \n", " def forward(self, x):\n", " # 各层对应的激活函数\n", @@ -107,9 +107,9 @@ "outputs": 
[], "source": [ "class DQN:\n", - " def __init__(self, n_states, n_actions, cfg):\n", + " def __init__(self, state_dim, action_dim, cfg):\n", "\n", - " self.n_actions = n_actions # 总的动作个数\n", + " self.action_dim = action_dim # 总的动作个数\n", " self.device = cfg.device # 设备,cpu或gpu等\n", " self.gamma = cfg.gamma # 奖励的折扣因子\n", " # e-greedy策略相关参数\n", @@ -118,8 +118,8 @@ " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", " self.batch_size = cfg.batch_size\n", - " self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net\n", " target_param.data.copy_(param.data)\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", @@ -135,7 +135,7 @@ " q_values = self.policy_net(state)\n", " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", " else:\n", - " action = random.randrange(self.n_actions)\n", + " action = random.randrange(self.action_dim)\n", " return action\n", " def update(self):\n", " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略\n", @@ -211,9 +211,9 @@ " '''\n", " env = gym.make(cfg.env) # 创建环境\n", " env.seed(seed) # 设置随机种子\n", - " n_states = env.observation_space.shape[0] # 状态数\n", - " n_actions = env.action_space.n # 动作数\n", - " agent = DQN(n_states,n_actions,cfg) # 创建智能体\n", + " state_dim = env.observation_space.shape[0] # 状态数\n", + " action_dim = env.action_space.n # 动作数\n", + " agent = DQN(state_dim,action_dim,cfg) # 创建智能体\n", " return env,agent" ] }, diff --git a/codes/DQN/task0_train.py b/codes/DQN/train.py similarity index 52% rename from 
codes/DQN/task0_train.py rename to codes/DQN/train.py index 5fd0ccd..4f8510e 100644 --- a/codes/DQN/task0_train.py +++ b/codes/DQN/train.py @@ -9,63 +9,11 @@ LastEditTime: 2021-09-15 15:34:13 @Discription: @Environment: python 3.7.7 ''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime - -from common.utils import save_results, make_dir -from common.plot import plot_rewards -from DQN.agent import DQN - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -class DQNConfig: - def __init__(self): - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 200 # 训练的回合数 - self.eval_eps = 30 # 测试的回合数 - # 超参数 - self.gamma = 0.95 # 强化学习中的折扣因子 - self.epsilon_start = 0.90 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 - self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # 经验回放的容量 - self.batch_size = 64 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层 -class PlotConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - -def env_agent_config(cfg,seed=1): - ''' 创建环境和智能体 - ''' - env = gym.make(cfg.env_name) # 创建环境 - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态数 - n_actions = env.action_space.n # 动作数 - agent = DQN(n_states,n_actions,cfg) # 创建智能体 - return env,agent - def train(cfg, 
env, agent): ''' 训练 ''' print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): @@ -92,9 +40,9 @@ def train(cfg, env, agent): print('完成训练!') return rewards, ma_rewards -def eval(cfg,env,agent): +def test(cfg,env,agent): print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon @@ -115,11 +63,64 @@ def eval(cfg,env,agent): ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) else: ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}") + print(f"回合:{i_ep+1}/{cfg.eval_eps},奖励:{ep_reward:.1f}") print('完成测试!') return rewards,ma_rewards if __name__ == "__main__": + import sys,os + curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 + parent_path = os.path.dirname(curr_path) # 父路径 + sys.path.append(parent_path) # 添加路径到系统路径 + + import gym + import torch + import datetime + + from common.utils import save_results, make_dir + from common.utils import plot_rewards + from DQN.agent import DQN + from DQN.train import train + + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + class DQNConfig: + def __init__(self): + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.eval_eps = 30 # 测试的回合数 + # 超参数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch 
SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 + class PlotConfig: + def __init__(self) -> None: + self.algo = "DQN" # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + def env_agent_config(cfg,seed=1): + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env_name) # 创建环境 + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.shape[0] # 状态数 + action_dim = env.action_space.n # 动作数 + agent = DQN(state_dim,action_dim,cfg) # 创建智能体 + return env,agent + cfg = DQNConfig() plot_cfg = PlotConfig() # 训练 @@ -132,6 +133,6 @@ if __name__ == "__main__": # 测试 env,agent = env_agent_config(cfg,seed=10) agent.load(path=plot_cfg.model_path) # 导入模型 - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards,ma_rewards, plot_cfg, tag="eval") # 画出结果 + rewards,ma_rewards = test(cfg,env,agent) + save_results(rewards,ma_rewards,tag='test',path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards,ma_rewards, plot_cfg, tag="test") # 画出结果 \ No newline at end of file diff --git a/codes/Docs/使用DQN解决推车杆问题.md b/codes/Docs/使用DQN解决推车杆问题.md index 5889165..ac56ac6 100644 --- a/codes/Docs/使用DQN解决推车杆问题.md +++ b/codes/Docs/使用DQN解决推车杆问题.md @@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境,如下图,它通过向左(动作=0 import gym env = gym.make('CartPole-v0') # 建立环境 env.seed(1) # 随机种子 -n_states = env.observation_space.shape[0] # 状态数 -n_actions = env.action_space.n # 动作数 +state_dim = env.observation_space.shape[0] # 状态数 +action_dim = env.action_space.n # 动作数 state = env.reset() # 初始化环境 -print(f"状态数:{n_states},动作数:{n_actions}") +print(f"状态数:{state_dim},动作数:{action_dim}") print(f"初始状态:{state}") ``` diff --git 
a/codes/Docs/使用Q-learning解决悬崖寻路问题.md b/codes/Docs/使用Q-learning解决悬崖寻路问题.md index 244d85b..ac25945 100644 --- a/codes/Docs/使用Q-learning解决悬崖寻路问题.md +++ b/codes/Docs/使用Q-learning解决悬崖寻路问题.md @@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境 这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可,然后我们可以查看环境的状态和动作数目: ```python -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -print(f"状态数:{n_states},动作数:{n_actions}") +state_dim = env.observation_space.n # 状态数 +action_dim = env.action_space.n # 动作数 +print(f"状态数:{state_dim},动作数:{action_dim}") ``` 打印出来的结果如下: @@ -72,9 +72,9 @@ print(state) env = gym.make('CliffWalking-v0') # 定义环境 env = CliffWalkingWapper(env) # 装饰环境 env.seed(1) # 设置随机种子 -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数 +state_dim = env.observation_space.n # 状态数 +action_dim = env.action_space.n # 动作数 +agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数 ep_reward = 0 # 记录每个回合的奖励 state = env.reset() # 重置环境 diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index 1ade5f8..7b26fa1 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-05-04 22:28:06 +LastEditTime: 2021-11-19 18:07:09 @Discription: @Environment: python 3.7.7 ''' @@ -16,15 +16,55 @@ LastEditTime: 2021-05-04 22:28:06 import torch import torch.nn as nn import torch.optim as optim +import torch.nn.functional as F import random import math import numpy as np -from common.memory import ReplayBuffer -from common.model import MLP -class DoubleDQN: - def __init__(self, state_dim, action_dim, cfg): + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def 
push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) + +class MLP(nn.Module): + def __init__(self, n_states,n_actions,hidden_dim=128): + """ 初始化q网络,为全连接网络 + n_states: 输入的特征数即环境的状态数 + n_actions: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 - self.action_dim = action_dim # 总的动作个数 + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + +class DoubleDQN: + def __init__(self, n_states, n_actions, cfg): + self.n_actions = n_actions # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # e-greedy策略相关参数 @@ -33,8 +73,8 @@ class DoubleDQN: self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) # target_net copy from policy_net for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): target_param.data.copy_(param.data) @@ -43,8 +83,15 @@ class DoubleDQN: self.optimizer = 
optim.Adam(self.policy_net.parameters(), lr=cfg.lr) self.loss = 0 self.memory = ReplayBuffer(cfg.memory_capacity) - def predict(self,state): - with torch.no_grad(): + + def choose_action(self, state): + '''选择动作 + ''' + self.actions_count += 1 + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ + math.exp(-1. * self.actions_count / self.epsilon_decay) + if random.random() > self.epsilon: + with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor( @@ -55,17 +102,8 @@ class DoubleDQN: # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() - return action - def choose_action(self, state): - '''选择动作 - ''' - self.actions_count += 1 - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. * self.actions_count / self.epsilon_decay) - if random.random() > self.epsilon: - action = self.predict(state) else: - action = random.randrange(self.action_dim) + action = random.randrange(self.n_actions) return action def update(self): diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth deleted file mode 100644 index 8c4b561..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy deleted file mode 100644 index 0f77696..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy 
b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy deleted file mode 100644 index 57f8759..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png deleted file mode 100644 index 038e031..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy deleted file mode 100644 index 63d10e7..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy deleted file mode 100644 index d486ad9..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png deleted file mode 100644 index f91bc4d..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth new file mode 100644 index 0000000..fc1ca66 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth differ diff --git 
a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy new file mode 100644 index 0000000..b32c0a8 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy new file mode 100644 index 0000000..9ccf4e9 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png new file mode 100644 index 0000000..3580ef9 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy new file mode 100644 index 0000000..b0838ab Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy new file mode 100644 index 0000000..12e1347 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png new file mode 100644 index 0000000..d612a6a Binary files /dev/null and 
b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png differ diff --git a/codes/DoubleDQN/task0.py b/codes/DoubleDQN/task0.py new file mode 100644 index 0000000..4fe9579 --- /dev/null +++ b/codes/DoubleDQN/task0.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-11-07 18:10:37 +LastEditor: JiangJi +LastEditTime: 2021-11-19 18:34:05 +Discription: +''' + +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime + +from common.utils import save_results, make_dir +from common.utils import plot_rewards +from DoubleDQN.agent import DoubleDQN +from DoubleDQN.train import train,test + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'DoubleDQN' # 算法名称 +env_name = 'CartPole-v0' # 环境名称 +class DoubleDQNConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.95 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + 
"/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + agent = DoubleDQN(n_states,n_actions,cfg) + return env,agent + +cfg = DoubleDQNConfig() +plot_cfg = PlotConfig() +# 训练 +env,agent = env_agent_config(cfg,seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) # 保存模型 +save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env,agent = env_agent_config(cfg,seed=10) +agent.load(path=plot_cfg.model_path) # 导入模型 +rewards,ma_rewards = test(cfg,env,agent) +save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DoubleDQN/task0_train.ipynb b/codes/DoubleDQN/task0_train.ipynb deleted file mode 100644 index ee2e5d4..0000000 --- a/codes/DoubleDQN/task0_train.ipynb +++ /dev/null @@ -1,194 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3710jvsc74a57bd0366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232", - "display_name": "Python 3.7.10 64-bit ('py37': conda)" - }, - "metadata": { - "interpreter": { - "hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - 
"from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add current terminal path to sys.path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import torch\n", - "import datetime\n", - "from DoubleDQN.agent import DoubleDQN\n", - "from common.plot import plot_rewards\n", - "from common.utils import save_results, make_dir\n", - "\n", - "curr_time = datetime.datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\") # obtain current time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DoubleDQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"DoubleDQN\" # name of algo\n", - " self.env = 'CartPole-v0' # env name\n", - " self.result_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/results/' # path to save results\n", - " self.model_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/models/' # path to save models\n", - " self.train_eps = 200 # max tranng episodes\n", - " self.eval_eps = 50 # max evaling episodes\n", - " self.gamma = 0.95\n", - " self.epsilon_start = 1 # start epsilon of e-greedy policy\n", - " self.epsilon_end = 0.01 \n", - " self.epsilon_decay = 500\n", - " self.lr = 0.001 # learning rate\n", - " self.memory_capacity = 100000 # capacity of Replay Memory\n", - " self.batch_size = 64\n", - " self.target_update = 2 # update frequency of target net\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # check gpu\n", - " self.hidden_dim = 256 # hidden size of net" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def env_agent_config(cfg,seed=1):\n", - " env = gym.make(cfg.env) \n", - " env.seed(seed)\n", - " state_dim = env.observation_space.shape[0]\n", - " 
action_dim = env.action_space.n\n", - " agent = DoubleDQN(state_dim,action_dim,cfg)\n", - " return env,agent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def train(cfg,env,agent):\n", - " print('Start to train !')\n", - " rewards,ma_rewards = [],[]\n", - " for i_ep in range(cfg.train_eps):\n", - " state = env.reset() \n", - " ep_reward = 0\n", - " while True:\n", - " action = agent.choose_action(state) \n", - " next_state, reward, done, _ = env.step(action)\n", - " ep_reward += reward\n", - " agent.memory.push(state, action, reward, next_state, done) \n", - " state = next_state \n", - " agent.update() \n", - " if done:\n", - " break\n", - " if i_ep % cfg.target_update == 0:\n", - " agent.target_net.load_state_dict(agent.policy_net.state_dict())\n", - " if (i_ep+1)%10 == 0:\n", - " print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}')\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(\n", - " 0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward) \n", - " print('Complete training!')\n", - " return rewards,ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def eval(cfg,env,agent):\n", - " print('Start to eval !')\n", - " print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n", - " rewards = [] \n", - " ma_rewards = []\n", - " for i_ep in range(cfg.eval_eps):\n", - " state = env.reset() \n", - " ep_reward = 0 \n", - " while True:\n", - " action = agent.predict(state) \n", - " next_state, reward, done, _ = env.step(action) \n", - " state = next_state \n", - " ep_reward += reward\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print(f\"Episode:{i_ep+1}/{cfg.eval_eps}, 
reward:{ep_reward:.1f}\")\n", - " print('Complete evaling!')\n", - " return rewards,ma_rewards " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if __name__ == \"__main__\":\n", - " cfg = DoubleDQNConfig()\n", - " # train\n", - " env,agent = env_agent_config(cfg,seed=1)\n", - " rewards, ma_rewards = train(cfg, env, agent)\n", - " make_dir(cfg.result_path, cfg.model_path)\n", - " agent.save(path=cfg.model_path)\n", - " save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)\n", - " plot_rewards(rewards, ma_rewards, tag=\"train\",\n", - " algo=cfg.algo, path=cfg.result_path)\n", - "\n", - " # eval\n", - " env,agent = env_agent_config(cfg,seed=10)\n", - " agent.load(path=cfg.model_path)\n", - " rewards,ma_rewards = eval(cfg,env,agent)\n", - " save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)\n", - " plot_rewards(rewards,ma_rewards,tag=\"eval\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)" - ] - } - ] -} \ No newline at end of file diff --git a/codes/DoubleDQN/task0_train.py b/codes/DoubleDQN/task0_train.py deleted file mode 100644 index 0148ea2..0000000 --- a/codes/DoubleDQN/task0_train.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-12 00:48:57 -@LastEditor: John -LastEditTime: 2021-09-10 15:26:05 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import gym -import torch -import datetime -from DoubleDQN.agent import DoubleDQN -from common.plot import plot_rewards -from common.utils import save_results, make_dir - -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time - -class DoubleDQNConfig: - def __init__(self): - self.algo = "DoubleDQN" # name of algo - self.env = 'CartPole-v0' 
# env name - self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models - self.train_eps = 200 # max tranng episodes - self.eval_eps = 50 # max evaling episodes - self.gamma = 0.95 - self.epsilon_start = 1 # start epsilon of e-greedy policy - self.epsilon_end = 0.01 - self.epsilon_decay = 500 - self.lr = 0.001 # learning rate - self.memory_capacity = 100000 # capacity of Replay Memory - self.batch_size = 64 - self.target_update = 2 # update frequency of target net - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu - self.hidden_dim = 256 # hidden size of net - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = DoubleDQN(state_dim,action_dim,cfg) - return env,agent - -def train(cfg,env,agent): - print('Start to train !') - rewards,ma_rewards = [],[] - for i_ep in range(cfg.train_eps): - state = env.reset() - ep_reward = 0 - while True: - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() - if done: - break - if i_ep % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward},Epsilon:{agent.epsilon:.2f}') - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards,ma_rewards - -def eval(cfg,env,agent): - print('Start to eval !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - rewards = [] - ma_rewards = [] - for i_ep in range(cfg.eval_eps): - state = env.reset() - 
ep_reward = 0 - while True: - action = agent.predict(state) - next_state, reward, done, _ = env.step(action) - state = next_state - ep_reward += reward - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}") - print('Complete evaling!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DoubleDQNConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) - - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) diff --git a/codes/DoubleDQN/train.py b/codes/DoubleDQN/train.py new file mode 100644 index 0000000..ff0a786 --- /dev/null +++ b/codes/DoubleDQN/train.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-11-07 18:10:37 +LastEditor: JiangJi +LastEditTime: 2021-11-19 18:34:05 +Discription: +''' + +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +def train(cfg,env,agent): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) + next_state, 
reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) + state = next_state + agent.update() + if done: + break + if i_ep % cfg.target_update == 0: + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward}') + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards,ma_rewards + +def test(cfg,env,agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): + state = env.reset() + ep_reward = 0 + while True: + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + state = next_state + ep_reward += reward + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards,ma_rewards + diff --git a/codes/DuelingDQN/task0_train.ipynb b/codes/DuelingDQN/task0_train.ipynb index c2cd1c3..7e38218 100644 --- a/codes/DuelingDQN/task0_train.ipynb +++ b/codes/DuelingDQN/task0_train.ipynb @@ -136,12 +136,12 @@ "outputs": [], "source": [ "class DuelingNet(nn.Module):\n", - " def __init__(self, n_states, n_actions,hidden_size=128):\n", + " def __init__(self, state_dim, action_dim,hidden_size=128):\n", " super(DuelingNet, self).__init__()\n", " \n", " # 隐藏层\n", " self.hidden = nn.Sequential(\n", - " nn.Linear(n_states, hidden_size),\n", + " nn.Linear(state_dim, hidden_size),\n", " nn.ReLU()\n", " )\n", " \n", @@ -149,7 +149,7 @@ " self.advantage = 
nn.Sequential(\n", " nn.Linear(hidden_size, hidden_size),\n", " nn.ReLU(),\n", - " nn.Linear(hidden_size, n_actions)\n", + " nn.Linear(hidden_size, action_dim)\n", " )\n", " \n", " # 价值函数\n", @@ -192,7 +192,7 @@ ], "source": [ "class DuelingDQN:\n", - " def __init__(self,n_states,n_actions,cfg) -> None:\n", + " def __init__(self,state_dim,action_dim,cfg) -> None:\n", " self.batch_size = cfg.batch_size\n", " self.device = cfg.device\n", " self.loss_history = [] # 记录loss的变化\n", @@ -200,8 +200,8 @@ " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", - " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", " target_param.data.copy_(param.data)\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", diff --git a/codes/HierarchicalDQN/agent.py b/codes/HierarchicalDQN/agent.py index 3760643..62c539a 100644 --- a/codes/HierarchicalDQN/agent.py +++ b/codes/HierarchicalDQN/agent.py @@ -11,23 +11,62 @@ Environment: ''' import torch import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F import numpy as np import random,math -import torch.optim as optim -from common.model import MLP -from common.memory import ReplayBuffer +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + 
''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) +class MLP(nn.Module): + def __init__(self, input_dim,output_dim,hidden_dim=128): + """ 初始化q网络,为全连接网络 + input_dim: 输入的特征数即环境的状态数 + output_dim: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层 + + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + class HierarchicalDQN: - def __init__(self,state_dim,action_dim,cfg): - self.state_dim = state_dim - self.action_dim = action_dim + def __init__(self,n_states,n_actions,cfg): + self.n_states = n_states + self.n_actions = n_actions self.gamma = cfg.gamma self.device = cfg.device self.batch_size = cfg.batch_size - self.frame_idx = 0 + self.frame_idx = 0 # 用于epsilon的衰减计数 self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. 
* frame_idx / cfg.epsilon_decay) - self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) - self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) + self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device) + self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) self.memory = ReplayBuffer(cfg.memory_capacity) @@ -37,7 +76,7 @@ class HierarchicalDQN: self.losses = [] self.meta_losses = [] def to_onehot(self,x): - oh = np.zeros(self.state_dim) + oh = np.zeros(self.n_states) oh[x - 1] = 1. return oh def set_goal(self,state): @@ -46,7 +85,7 @@ class HierarchicalDQN: state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) goal = self.meta_policy_net(state).max(1)[1].item() else: - goal = random.randrange(self.state_dim) + goal = random.randrange(self.n_states) return goal def choose_action(self,state): self.frame_idx += 1 @@ -56,7 +95,7 @@ class HierarchicalDQN: q_value = self.policy_net(state) action = q_value.max(1)[1].item() else: - action = random.randrange(self.action_dim) + action = random.randrange(self.n_actions) return action def update(self): self.update_policy() diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth new file mode 100644 index 0000000..02f3f7c Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth new file mode 100644 index 0000000..9d906ea Binary files /dev/null and 
b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy new file mode 100644 index 0000000..14dd955 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy new file mode 100644 index 0000000..e815222 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png new file mode 100644 index 0000000..645b21a Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy new file mode 100644 index 0000000..bf58391 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy new file mode 100644 index 0000000..f4d20ff Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png 
b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png new file mode 100644 index 0000000..20ccbc5 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy b/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy deleted file mode 100644 index daab87d..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png b/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png deleted file mode 100644 index 77555ad..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy b/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy deleted file mode 100644 index 5a1ad82..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png b/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png deleted file mode 100644 index 4f962ea..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy b/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy deleted file mode 100644 index 523bdb4..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png b/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png deleted file mode 100644 index 97443e5..0000000 Binary files 
a/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy b/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy deleted file mode 100644 index 99cf87a..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth deleted file mode 100644 index 873b3ef..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth deleted file mode 100644 index be8ea8a..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth deleted file mode 100644 index e3f7c38..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth deleted file mode 100644 index 6be6ea3..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/task0.py b/codes/HierarchicalDQN/task0.py new file mode 100644 index 0000000..b2cf312 --- /dev/null +++ b/codes/HierarchicalDQN/task0.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-29 10:37:32 +LastEditor: John +LastEditTime: 2021-05-04 22:35:56 
+Discription: +Environment: +''' +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import datetime +import numpy as np +import torch +import gym + +from common.utils import save_results,make_dir +from common.utils import plot_rewards +from HierarchicalDQN.agent import HierarchicalDQN +from HierarchicalDQN.train import train,test + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = "Hierarchical DQN" # 算法名称 +env_name = 'CartPole-v0' # 环境名称 +class HierarchicalDQNConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 300 # 训练的episode数目 + self.test_eps = 50 # 测试的episode数目 + self.gamma = 0.99 + self.epsilon_start = 1 # start epsilon of e-greedy policy + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.0001 # learning rate + self.memory_capacity = 10000 # Replay Memory capacity + self.batch_size = 32 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + agent = HierarchicalDQN(n_states,n_actions,cfg) + return env,agent + +if __name__ == "__main__": + cfg = HierarchicalDQNConfig() + plot_cfg = 
PlotConfig() + # 训练 + env, agent = env_agent_config(cfg, seed=1) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg, seed=10) + agent.load(path=plot_cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + diff --git a/codes/HierarchicalDQN/task0_train.ipynb b/codes/HierarchicalDQN/task0_train.ipynb deleted file mode 100644 index c63e950..0000000 --- a/codes/HierarchicalDQN/task0_train.ipynb +++ /dev/null @@ -1,477 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.10 64-bit ('py37': conda)", - "metadata": { - "interpreter": { - "hash": "fbea1422c2cf61ed9c0cfc03f38f71cc9083cc288606edc4170b5309b352ce27" - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys,os\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add current terminal path to sys.path\n", - "\n", - "import gym\n", - "import torch\n", - "import numpy as np\n", - "import datetime\n", - "\n", - "from HierarchicalDQN.agent import HierarchicalDQN\n", - "from common.plot import 
plot_rewards\n", - "from common.utils import save_results" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "SEQUENCE = datetime.datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\") # obtain current time\n", - "SAVED_MODEL_PATH = curr_path+\"/saved_model/\"+SEQUENCE+'/' # path to save model\n", - "if not os.path.exists(curr_path+\"/saved_model/\"):\n", - " os.mkdir(curr_path+\"/saved_model/\")\n", - "if not os.path.exists(SAVED_MODEL_PATH):\n", - " os.mkdir(SAVED_MODEL_PATH)\n", - "RESULT_PATH = curr_path+\"/results/\"+SEQUENCE+'/' # path to save rewards\n", - "if not os.path.exists(curr_path+\"/results/\"):\n", - " os.mkdir(curr_path+\"/results/\")\n", - "if not os.path.exists(RESULT_PATH):\n", - " os.mkdir(RESULT_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "class HierarchicalDQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"H-DQN\" # name of algo\n", - " self.gamma = 0.95\n", - " self.epsilon_start = 1 # start epsilon of e-greedy policy\n", - " self.epsilon_end = 0.01\n", - " self.epsilon_decay = 500\n", - " self.lr = 0.0001 # learning rate\n", - " self.memory_capacity = 20000 # Replay Memory capacity\n", - " self.batch_size = 64\n", - " self.train_eps = 300 # 训练的episode数目\n", - " self.target_update = 2 # target net的更新频率\n", - " self.eval_eps = 20 # 测试的episode数目\n", - " self.device = torch.device(\n", - " \"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测gpu\n", - " self.hidden_dim = 256 # dimension of hidden layer" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def train(cfg, env, agent):\n", - " print('Start to train !')\n", - " rewards = []\n", - " ma_rewards = [] # moveing average reward\n", - " for i_episode in range(cfg.train_eps):\n", - " state = env.reset()\n", - " done = False\n", - " ep_reward = 0\n", - " while not done:\n", - " goal = 
agent.set_goal(state)\n", - " onehot_goal = agent.to_onehot(goal)\n", - " meta_state = state\n", - " extrinsic_reward = 0\n", - " while not done and goal != np.argmax(state):\n", - " goal_state = np.concatenate([state, onehot_goal])\n", - " action = agent.choose_action(goal_state)\n", - " next_state, reward, done, _ = env.step(action)\n", - " ep_reward += reward\n", - " extrinsic_reward += reward\n", - " intrinsic_reward = 1.0 if goal == np.argmax(\n", - " next_state) else 0.0\n", - " agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(\n", - " [next_state, onehot_goal]), done)\n", - " state = next_state\n", - " agent.update()\n", - " agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)\n", - " print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward))\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(\n", - " 0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print('Complete training!')\n", - " return rewards, ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Start to train !\n", - "Episode:1/300, Reward:25.0\n", - "Episode:2/300, Reward:26.0\n", - "Episode:3/300, Reward:23.0\n", - "Episode:4/300, Reward:19.0\n", - "Episode:5/300, Reward:23.0\n", - "Episode:6/300, Reward:21.0\n", - "Episode:7/300, Reward:21.0\n", - "Episode:8/300, Reward:22.0\n", - "Episode:9/300, Reward:15.0\n", - "Episode:10/300, Reward:12.0\n", - "Episode:11/300, Reward:39.0\n", - "Episode:12/300, Reward:42.0\n", - "Episode:13/300, Reward:79.0\n", - "Episode:14/300, Reward:54.0\n", - "Episode:15/300, Reward:28.0\n", - "Episode:16/300, Reward:85.0\n", - "Episode:17/300, Reward:46.0\n", - "Episode:18/300, Reward:37.0\n", - "Episode:19/300, Reward:45.0\n", - "Episode:20/300, Reward:79.0\n", - "Episode:21/300, Reward:80.0\n", - "Episode:22/300, 
Reward:154.0\n", - "Episode:23/300, Reward:74.0\n", - "Episode:24/300, Reward:129.0\n", - "Episode:25/300, Reward:185.0\n", - "Episode:26/300, Reward:200.0\n", - "Episode:27/300, Reward:115.0\n", - "Episode:28/300, Reward:104.0\n", - "Episode:29/300, Reward:200.0\n", - "Episode:30/300, Reward:118.0\n", - "Episode:31/300, Reward:200.0\n", - "Episode:32/300, Reward:200.0\n", - "Episode:33/300, Reward:83.0\n", - "Episode:34/300, Reward:75.0\n", - "Episode:35/300, Reward:46.0\n", - "Episode:36/300, Reward:96.0\n", - "Episode:37/300, Reward:78.0\n", - "Episode:38/300, Reward:150.0\n", - "Episode:39/300, Reward:147.0\n", - "Episode:40/300, Reward:74.0\n", - "Episode:41/300, Reward:137.0\n", - "Episode:42/300, Reward:182.0\n", - "Episode:43/300, Reward:200.0\n", - "Episode:44/300, Reward:200.0\n", - "Episode:45/300, Reward:200.0\n", - "Episode:46/300, Reward:184.0\n", - "Episode:47/300, Reward:200.0\n", - "Episode:48/300, Reward:200.0\n", - "Episode:49/300, Reward:200.0\n", - "Episode:50/300, Reward:61.0\n", - "Episode:51/300, Reward:9.0\n", - "Episode:52/300, Reward:9.0\n", - "Episode:53/300, Reward:200.0\n", - "Episode:54/300, Reward:200.0\n", - "Episode:55/300, Reward:200.0\n", - "Episode:56/300, Reward:200.0\n", - "Episode:57/300, Reward:200.0\n", - "Episode:58/300, Reward:200.0\n", - "Episode:59/300, Reward:200.0\n", - "Episode:60/300, Reward:167.0\n", - "Episode:61/300, Reward:200.0\n", - "Episode:62/300, Reward:200.0\n", - "Episode:63/300, Reward:200.0\n", - "Episode:64/300, Reward:200.0\n", - "Episode:65/300, Reward:200.0\n", - "Episode:66/300, Reward:200.0\n", - "Episode:67/300, Reward:200.0\n", - "Episode:68/300, Reward:200.0\n", - "Episode:69/300, Reward:197.0\n", - "Episode:70/300, Reward:200.0\n", - "Episode:71/300, Reward:200.0\n", - "Episode:72/300, Reward:200.0\n", - "Episode:73/300, Reward:200.0\n", - "Episode:74/300, Reward:200.0\n", - "Episode:75/300, Reward:200.0\n", - "Episode:76/300, Reward:200.0\n", - "Episode:77/300, Reward:200.0\n", - 
"Episode:78/300, Reward:200.0\n", - "Episode:79/300, Reward:200.0\n", - "Episode:80/300, Reward:200.0\n", - "Episode:81/300, Reward:181.0\n", - "Episode:82/300, Reward:200.0\n", - "Episode:83/300, Reward:200.0\n", - "Episode:84/300, Reward:200.0\n", - "Episode:85/300, Reward:200.0\n", - "Episode:86/300, Reward:200.0\n", - "Episode:87/300, Reward:200.0\n", - "Episode:88/300, Reward:200.0\n", - "Episode:89/300, Reward:200.0\n", - "Episode:90/300, Reward:200.0\n", - "Episode:91/300, Reward:200.0\n", - "Episode:92/300, Reward:200.0\n", - "Episode:93/300, Reward:200.0\n", - "Episode:94/300, Reward:200.0\n", - "Episode:95/300, Reward:200.0\n", - "Episode:96/300, Reward:200.0\n", - "Episode:97/300, Reward:200.0\n", - "Episode:98/300, Reward:200.0\n", - "Episode:99/300, Reward:192.0\n", - "Episode:100/300, Reward:183.0\n", - "Episode:101/300, Reward:200.0\n", - "Episode:102/300, Reward:200.0\n", - "Episode:103/300, Reward:200.0\n", - "Episode:104/300, Reward:200.0\n", - "Episode:105/300, Reward:200.0\n", - "Episode:106/300, Reward:200.0\n", - "Episode:107/300, Reward:200.0\n", - "Episode:108/300, Reward:200.0\n", - "Episode:109/300, Reward:200.0\n", - "Episode:110/300, Reward:200.0\n", - "Episode:111/300, Reward:200.0\n", - "Episode:112/300, Reward:200.0\n", - "Episode:113/300, Reward:200.0\n", - "Episode:114/300, Reward:200.0\n", - "Episode:115/300, Reward:200.0\n", - "Episode:116/300, Reward:200.0\n", - "Episode:117/300, Reward:200.0\n", - "Episode:118/300, Reward:200.0\n", - "Episode:119/300, Reward:200.0\n", - "Episode:120/300, Reward:196.0\n", - "Episode:121/300, Reward:200.0\n", - "Episode:122/300, Reward:200.0\n", - "Episode:123/300, Reward:200.0\n", - "Episode:124/300, Reward:200.0\n", - "Episode:125/300, Reward:200.0\n", - "Episode:126/300, Reward:189.0\n", - "Episode:127/300, Reward:193.0\n", - "Episode:128/300, Reward:200.0\n", - "Episode:129/300, Reward:200.0\n", - "Episode:130/300, Reward:193.0\n", - "Episode:131/300, Reward:183.0\n", - "Episode:132/300, 
Reward:183.0\n", - "Episode:133/300, Reward:200.0\n", - "Episode:134/300, Reward:200.0\n", - "Episode:135/300, Reward:200.0\n", - "Episode:136/300, Reward:200.0\n", - "Episode:137/300, Reward:200.0\n", - "Episode:138/300, Reward:200.0\n", - "Episode:139/300, Reward:100.0\n", - "Episode:140/300, Reward:118.0\n", - "Episode:141/300, Reward:99.0\n", - "Episode:142/300, Reward:185.0\n", - "Episode:143/300, Reward:41.0\n", - "Episode:144/300, Reward:11.0\n", - "Episode:145/300, Reward:9.0\n", - "Episode:146/300, Reward:152.0\n", - "Episode:147/300, Reward:155.0\n", - "Episode:148/300, Reward:181.0\n", - "Episode:149/300, Reward:197.0\n", - "Episode:150/300, Reward:200.0\n", - "Episode:151/300, Reward:200.0\n", - "Episode:152/300, Reward:200.0\n", - "Episode:153/300, Reward:200.0\n", - "Episode:154/300, Reward:200.0\n", - "Episode:155/300, Reward:200.0\n", - "Episode:156/300, Reward:123.0\n", - "Episode:157/300, Reward:11.0\n", - "Episode:158/300, Reward:8.0\n", - "Episode:159/300, Reward:9.0\n", - "Episode:160/300, Reward:10.0\n", - "Episode:161/300, Reward:9.0\n", - "Episode:162/300, Reward:10.0\n", - "Episode:163/300, Reward:9.0\n", - "Episode:164/300, Reward:9.0\n", - "Episode:165/300, Reward:10.0\n", - "Episode:166/300, Reward:9.0\n", - "Episode:167/300, Reward:9.0\n", - "Episode:168/300, Reward:9.0\n", - "Episode:169/300, Reward:9.0\n", - "Episode:170/300, Reward:10.0\n", - "Episode:171/300, Reward:9.0\n", - "Episode:172/300, Reward:9.0\n", - "Episode:173/300, Reward:11.0\n", - "Episode:174/300, Reward:11.0\n", - "Episode:175/300, Reward:10.0\n", - "Episode:176/300, Reward:9.0\n", - "Episode:177/300, Reward:10.0\n", - "Episode:178/300, Reward:8.0\n", - "Episode:179/300, Reward:9.0\n", - "Episode:180/300, Reward:9.0\n", - "Episode:181/300, Reward:10.0\n", - "Episode:182/300, Reward:10.0\n", - "Episode:183/300, Reward:9.0\n", - "Episode:184/300, Reward:10.0\n", - "Episode:185/300, Reward:10.0\n", - "Episode:186/300, Reward:13.0\n", - "Episode:187/300, Reward:16.0\n", 
- "Episode:188/300, Reward:117.0\n", - "Episode:189/300, Reward:13.0\n", - "Episode:190/300, Reward:16.0\n", - "Episode:191/300, Reward:11.0\n", - "Episode:192/300, Reward:11.0\n", - "Episode:193/300, Reward:13.0\n", - "Episode:194/300, Reward:13.0\n", - "Episode:195/300, Reward:9.0\n", - "Episode:196/300, Reward:20.0\n", - "Episode:197/300, Reward:12.0\n", - "Episode:198/300, Reward:10.0\n", - "Episode:199/300, Reward:14.0\n", - "Episode:200/300, Reward:12.0\n", - "Episode:201/300, Reward:14.0\n", - "Episode:202/300, Reward:12.0\n", - "Episode:203/300, Reward:11.0\n", - "Episode:204/300, Reward:10.0\n", - "Episode:205/300, Reward:13.0\n", - "Episode:206/300, Reward:10.0\n", - "Episode:207/300, Reward:10.0\n", - "Episode:208/300, Reward:13.0\n", - "Episode:209/300, Reward:9.0\n", - "Episode:210/300, Reward:11.0\n", - "Episode:211/300, Reward:14.0\n", - "Episode:212/300, Reward:10.0\n", - "Episode:213/300, Reward:20.0\n", - "Episode:214/300, Reward:12.0\n", - "Episode:215/300, Reward:13.0\n", - "Episode:216/300, Reward:17.0\n", - "Episode:217/300, Reward:17.0\n", - "Episode:218/300, Reward:11.0\n", - "Episode:219/300, Reward:15.0\n", - "Episode:220/300, Reward:26.0\n", - "Episode:221/300, Reward:73.0\n", - "Episode:222/300, Reward:44.0\n", - "Episode:223/300, Reward:48.0\n", - "Episode:224/300, Reward:102.0\n", - "Episode:225/300, Reward:162.0\n", - "Episode:226/300, Reward:123.0\n", - "Episode:227/300, Reward:200.0\n", - "Episode:228/300, Reward:200.0\n", - "Episode:229/300, Reward:120.0\n", - "Episode:230/300, Reward:173.0\n", - "Episode:231/300, Reward:138.0\n", - "Episode:232/300, Reward:106.0\n", - "Episode:233/300, Reward:193.0\n", - "Episode:234/300, Reward:117.0\n", - "Episode:235/300, Reward:120.0\n", - "Episode:236/300, Reward:98.0\n", - "Episode:237/300, Reward:98.0\n", - "Episode:238/300, Reward:200.0\n", - "Episode:239/300, Reward:96.0\n", - "Episode:240/300, Reward:170.0\n", - "Episode:241/300, Reward:107.0\n", - "Episode:242/300, Reward:107.0\n", - 
"Episode:243/300, Reward:200.0\n", - "Episode:244/300, Reward:128.0\n", - "Episode:245/300, Reward:165.0\n", - "Episode:246/300, Reward:168.0\n", - "Episode:247/300, Reward:200.0\n", - "Episode:248/300, Reward:200.0\n", - "Episode:249/300, Reward:200.0\n", - "Episode:250/300, Reward:200.0\n", - "Episode:251/300, Reward:200.0\n", - "Episode:252/300, Reward:200.0\n", - "Episode:253/300, Reward:200.0\n", - "Episode:254/300, Reward:200.0\n", - "Episode:255/300, Reward:200.0\n", - "Episode:256/300, Reward:200.0\n", - "Episode:257/300, Reward:164.0\n", - "Episode:258/300, Reward:200.0\n", - "Episode:259/300, Reward:190.0\n", - "Episode:260/300, Reward:185.0\n", - "Episode:261/300, Reward:200.0\n", - "Episode:262/300, Reward:200.0\n", - "Episode:263/300, Reward:200.0\n", - "Episode:264/300, Reward:200.0\n", - "Episode:265/300, Reward:168.0\n", - "Episode:266/300, Reward:200.0\n", - "Episode:267/300, Reward:200.0\n", - "Episode:268/300, Reward:200.0\n", - "Episode:269/300, Reward:200.0\n", - "Episode:270/300, Reward:200.0\n", - "Episode:271/300, Reward:200.0\n", - "Episode:272/300, Reward:200.0\n", - "Episode:273/300, Reward:200.0\n", - "Episode:274/300, Reward:200.0\n", - "Episode:275/300, Reward:188.0\n", - "Episode:276/300, Reward:200.0\n", - "Episode:277/300, Reward:177.0\n", - "Episode:278/300, Reward:200.0\n", - "Episode:279/300, Reward:200.0\n", - "Episode:280/300, Reward:200.0\n", - "Episode:281/300, Reward:200.0\n", - "Episode:282/300, Reward:200.0\n", - "Episode:283/300, Reward:200.0\n", - "Episode:284/300, Reward:189.0\n", - "Episode:285/300, Reward:200.0\n", - "Episode:286/300, Reward:200.0\n", - "Episode:287/300, Reward:200.0\n", - "Episode:288/300, Reward:200.0\n", - "Episode:289/300, Reward:200.0\n", - "Episode:290/300, Reward:200.0\n", - "Episode:291/300, Reward:200.0\n", - "Episode:292/300, Reward:200.0\n", - "Episode:293/300, Reward:200.0\n", - "Episode:294/300, Reward:200.0\n", - "Episode:295/300, Reward:200.0\n", - "Episode:296/300, Reward:200.0\n", - 
"Episode:297/300, Reward:200.0\n", - "Episode:298/300, Reward:200.0\n", - "Episode:299/300, Reward:200.0\n", - "Episode:300/300, Reward:200.0\n", - "Complete training!\n", - "results saved!\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": "
", - "image/svg+xml": "\n\n\n\n \n \n \n \n 2021-03-31T14:01:15.395751\n image/svg+xml\n \n \n Matplotlib v3.3.4, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "image/png": "\n" - }, - "metadata": {} - } - ], - "source": [ - "env = gym.make('CartPole-v0')\n", - "env.seed(1)\n", - "cfg = HierarchicalDQNConfig()\n", - "state_dim = env.observation_space.shape[0]\n", - "action_dim = env.action_space.n\n", - "agent = HierarchicalDQN(state_dim, action_dim, cfg)\n", - "rewards, ma_rewards = train(cfg, env, agent)\n", - "agent.save(path=SAVED_MODEL_PATH)\n", - "save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)\n", - "plot_rewards(rewards, ma_rewards, tag=\"train\",\n", - " algo=cfg.algo, path=RESULT_PATH)" - ] - } - ] -} \ No newline at end of file diff --git a/codes/HierarchicalDQN/task0_train.py 
b/codes/HierarchicalDQN/task0_train.py deleted file mode 100644 index 2676094..0000000 --- a/codes/HierarchicalDQN/task0_train.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2021-03-29 10:37:32 -LastEditor: John -LastEditTime: 2021-05-04 22:35:56 -Discription: -Environment: -''' - - -import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import datetime -import numpy as np -import torch -import gym - -from common.utils import save_results,make_dir -from common.plot import plot_rewards -from HierarchicalDQN.agent import HierarchicalDQN - -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time - -class HierarchicalDQNConfig: - def __init__(self): - self.algo = "H-DQN" # name of algo - self.env = 'CartPole-v0' - self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models - self.train_eps = 300 # 训练的episode数目 - self.eval_eps = 50 # 测试的episode数目 - self.gamma = 0.99 - self.epsilon_start = 1 # start epsilon of e-greedy policy - self.epsilon_end = 0.01 - self.epsilon_decay = 200 - self.lr = 0.0001 # learning rate - self.memory_capacity = 10000 # Replay Memory capacity - self.batch_size = 32 - self.target_update = 2 # target net的更新频率 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - self.hidden_dim = 256 # dimension of hidden layer - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = HierarchicalDQN(state_dim,action_dim,cfg) - return env,agent - -def train(cfg, env, agent): - print('Start to train !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, 
Device:{cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward - for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - goal = agent.set_goal(state) - onehot_goal = agent.to_onehot(goal) - meta_state = state - extrinsic_reward = 0 - while not done and goal != np.argmax(state): - goal_state = np.concatenate([state, onehot_goal]) - action = agent.choose_action(goal_state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - extrinsic_reward += reward - intrinsic_reward = 1.0 if goal == np.argmax( - next_state) else 0.0 - agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate( - [next_state, onehot_goal]), done) - state = next_state - agent.update() - agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) - print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy )) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards, ma_rewards - -def eval(cfg, env, agent): - print('Start to eval !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward - for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - goal = agent.set_goal(state) - onehot_goal = agent.to_onehot(goal) - extrinsic_reward = 0 - while not done and goal != np.argmax(state): - goal_state = np.concatenate([state, onehot_goal]) - action = agent.choose_action(goal_state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - extrinsic_reward += reward - state = next_state - agent.update() - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}, Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') - 
rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards, ma_rewards - -if __name__ == "__main__": - cfg = HierarchicalDQNConfig() - - # train - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) - # eval - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) - diff --git a/codes/HierarchicalDQN/train.py b/codes/HierarchicalDQN/train.py new file mode 100644 index 0000000..3dc8aa3 --- /dev/null +++ b/codes/HierarchicalDQN/train.py @@ -0,0 +1,77 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import numpy as np + +def train(cfg, env, agent): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + done = False + ep_reward = 0 + while not done: + goal = agent.set_goal(state) + onehot_goal = agent.to_onehot(goal) + meta_state = state + extrinsic_reward = 0 + while not done and goal != np.argmax(state): + goal_state = np.concatenate([state, onehot_goal]) + action = agent.choose_action(goal_state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + extrinsic_reward += reward + intrinsic_reward = 1.0 if goal == np.argmax( + next_state) else 0.0 + 
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate( + [next_state, onehot_goal]), done) + state = next_state + agent.update() + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward},Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') + agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg, env, agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + done = False + ep_reward = 0 + while not done: + goal = agent.set_goal(state) + onehot_goal = agent.to_onehot(goal) + extrinsic_reward = 0 + while not done and goal != np.argmax(state): + goal_state = np.concatenate([state, onehot_goal]) + action = agent.choose_action(goal_state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + extrinsic_reward += reward + state = next_state + agent.update() + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward},Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards \ No newline at end of file diff --git a/codes/QLearning/task0_train.py b/codes/QLearning/task0_train.py index 6e616ab..2a9e0ea 100644 --- a/codes/QLearning/task0_train.py +++ b/codes/QLearning/task0_train.py @@ -45,9 +45,9 @@ def env_agent_config(cfg,seed=1): env = gym.make(cfg.env) env = CliffWalkingWapper(env) env.seed(seed) # 设置随机种子 - n_states = env.observation_space.n # 状态维度 - n_actions = env.action_space.n # 动作维度 - agent = 
QLearning(n_states,n_actions,cfg) + state_dim = env.observation_space.n # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = QLearning(state_dim,action_dim,cfg) return env,agent def train(cfg,env,agent): diff --git a/codes/README.md b/codes/README.md index fdee344..49f6ac7 100644 --- a/codes/README.md +++ b/codes/README.md @@ -45,4 +45,6 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 [RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) -[RL-Adventure](https://github.com/higgsfield/RL-Adventure) \ No newline at end of file +[RL-Adventure](https://github.com/higgsfield/RL-Adventure) + +[Google 开源项目风格指南——中文版](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments) \ No newline at end of file diff --git a/codes/SAC/model.py b/codes/SAC/model.py index 146db0d..85bbfcd 100644 --- a/codes/SAC/model.py +++ b/codes/SAC/model.py @@ -5,7 +5,7 @@ Author: JiangJi Email: johnjim0816@gmail.com Date: 2021-04-29 12:53:58 LastEditor: JiangJi -LastEditTime: 2021-04-29 12:57:29 +LastEditTime: 2021-11-19 18:04:19 Discription: Environment: ''' @@ -35,12 +35,12 @@ class ValueNet(nn.Module): class SoftQNet(nn.Module): - def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(SoftQNet, self).__init__() - self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -54,20 +54,20 @@ class SoftQNet(nn.Module): class PolicyNet(nn.Module): - def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_min=-20, log_std_max=2): + def __init__(self, state_dim, 
action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): super(PolicyNet, self).__init__() self.log_std_min = log_std_min self.log_std_max = log_std_max - self.linear1 = nn.Linear(num_inputs, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear1 = nn.Linear(state_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) - self.mean_linear = nn.Linear(hidden_size, num_actions) + self.mean_linear = nn.Linear(hidden_dim, action_dim) self.mean_linear.weight.data.uniform_(-init_w, init_w) self.mean_linear.bias.data.uniform_(-init_w, init_w) - self.log_std_linear = nn.Linear(hidden_size, num_actions) + self.log_std_linear = nn.Linear(hidden_dim, action_dim) self.log_std_linear.weight.data.uniform_(-init_w, init_w) self.log_std_linear.bias.data.uniform_(-init_w, init_w) diff --git a/codes/common/model.py b/codes/common/model.py index be03368..27e5e4e 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -15,15 +15,15 @@ import torch.nn.functional as F from torch.distributions import Categorical class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): + def __init__(self, input_dim,output_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态数 - n_actions: 输出的动作维度 + input_dim: 输入的特征数即环境的状态数 + output_dim: 输出的动作维度 """ super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -32,10 +32,10 @@ class MLP(nn.Module): return self.fc3(x) class Critic(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) + self.linear1 = nn.Linear(n_obs + action_dim, 
hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) # 随机初始化为较小的值 @@ -51,11 +51,11 @@ class Critic(nn.Module): return x class Actor(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Actor, self).__init__() self.linear1 = nn.Linear(n_obs, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, n_actions) + self.linear3 = nn.Linear(hidden_size, action_dim) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -67,18 +67,18 @@ class Actor(nn.Module): return x class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git a/codes/common/multiprocessing_env.py b/codes/common/multiprocessing_env.py index 04b4e3c..28c8aba 100644 --- a/codes/common/multiprocessing_env.py +++ b/codes/common/multiprocessing_env.py @@ -1,5 +1,5 @@ -#This code is from openai baseline -#https://github.com/openai/baselines/tree/master/baselines/common/vec_env +# 该代码来自 openai baseline,用于多线程环境 +# https://github.com/openai/baselines/tree/master/baselines/common/vec_env import numpy as np from multiprocessing import Process, Pipe diff --git a/codes/common/plot.py b/codes/common/plot.py deleted file mode 100644 index bc9c1dd..0000000 --- a/codes/common/plot.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 
2020-10-07 20:57:11 -LastEditor: John -LastEditTime: 2021-09-23 12:23:01 -Discription: -Environment: -''' -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.font_manager import FontProperties # 导入字体模块 - -def plot_rewards(rewards,ma_rewards,plot_cfg,tag='train'): - sns.set() - plt.figure() # 创建一个图形实例,方便同时多画几个图 - plt.title("learning curve on {} of {} for {}".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env_name)) - plt.xlabel('epsiodes') - plt.plot(rewards,label='rewards') - plt.plot(ma_rewards,label='ma rewards') - plt.legend() - if plot_cfg.save: - plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag)) - plt.show() - -def plot_losses(losses,algo = "DQN",save=True,path='./'): - sns.set() - plt.figure() - plt.title("loss curve of {}".format(algo)) - plt.xlabel('epsiodes') - plt.plot(losses,label='rewards') - plt.legend() - if save: - plt.savefig(path+"losses_curve") - plt.show() - diff --git a/codes/common/utils.py b/codes/common/utils.py index a3ca7be..6027804 100644 --- a/codes/common/utils.py +++ b/codes/common/utils.py @@ -5,29 +5,90 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 16:02:24 LastEditor: John -LastEditTime: 2021-09-11 21:48:49 +LastEditTime: 2021-11-30 18:39:19 Discription: Environment: ''' import os import numpy as np from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns -def save_results(rewards,ma_rewards,tag='train',path='./results'): - '''save rewards and ma_rewards +from matplotlib.font_manager import FontProperties # 导入字体模块 + +def chinese_font(): + ''' 设置中文字体,注意需要根据自己电脑情况更改字体路径,否则还是默认的字体 + ''' + try: + font = FontProperties( + fname='/System/Library/Fonts/STHeiti Light.ttc', size=15) # fname系统字体路径,此处是mac的 + except: + font = None + return font + +def plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag='train'): + ''' 中文画图 + ''' + sns.set() + plt.figure() + plt.title(u"{}环境下{}算法的学习曲线".format(plot_cfg.env_name, + plot_cfg.algo_name), fontproperties=chinese_font()) + 
plt.xlabel(u'回合数', fontproperties=chinese_font()) + plt.plot(rewards) + plt.plot(ma_rewards) + plt.legend((u'奖励', u'滑动平均奖励',), loc="best", prop=chinese_font()) + if plot_cfg.save: + plt.savefig(plot_cfg.result_path+f"{tag}_rewards_curve_cn") + # plt.show() + + +def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'): + sns.set() + plt.figure() # 创建一个图形实例,方便同时多画几个图 + plt.title("learning curve on {} of {} for {}".format( + plot_cfg.device, plot_cfg.algo_name, plot_cfg.env_name)) + plt.xlabel('epsiodes') + plt.plot(rewards, label='rewards') + plt.plot(ma_rewards, label='ma rewards') + plt.legend() + if plot_cfg.save: + plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag)) + plt.show() + + +def plot_losses(losses, algo="DQN", save=True, path='./'): + sns.set() + plt.figure() + plt.title("loss curve of {}".format(algo)) + plt.xlabel('epsiodes') + plt.plot(losses, label='rewards') + plt.legend() + if save: + plt.savefig(path+"losses_curve") + plt.show() + + +def save_results(rewards, ma_rewards, tag='train', path='./results'): + ''' 保存奖励 ''' np.save(path+'{}_rewards.npy'.format(tag), rewards) np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards) print('结果保存完毕!') + def make_dir(*paths): + ''' 创建文件夹 + ''' for path in paths: Path(path).mkdir(parents=True, exist_ok=True) + + def del_empty_dir(*paths): - '''del_empty_dir delete empty folders unders "paths" + ''' 删除目录下所有空文件夹 ''' for path in paths: dirs = os.listdir(path) for dir in dirs: if not os.listdir(os.path.join(path, dir)): - os.removedirs(os.path.join(path, dir)) \ No newline at end of file + os.removedirs(os.path.join(path, dir)) diff --git a/codes/envs/assets/gym_info_20211130180023.png b/codes/envs/assets/gym_info_20211130180023.png new file mode 100644 index 0000000..723b67f Binary files /dev/null and b/codes/envs/assets/gym_info_20211130180023.png differ diff --git a/codes/envs/gym_info.md b/codes/envs/gym_info.md index dd4268a..49da18f 100644 --- a/codes/envs/gym_info.md +++ 
b/codes/envs/gym_info.md @@ -1,4 +1,5 @@ -## 环境说明 +# OpenAI Gym 环境说明 +## 基础控制 ### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0) @@ -6,6 +7,17 @@ 通过向左或向右推车能够实现平衡,所以动作空间由两个动作组成。每进行一个step就会给一个reward,如果无法保持平衡那么done等于true,本次episode失败。理想状态下,每个episode至少能进行200个step,也就是说每个episode的reward总和至少为200,step数目至少为200 +### CartPole-v1 + +```CartPole v1```环境其实跟```CartPole v0```基本相同,区别仅在于每回合最大步数(max_episode_steps)以及奖励阈值(reward_threshold),如下是相关源码: + +![](assets/gym_info_20211130180023.png) + +这里先解释一下奖励阈值(reward_threshold),即Gym设置的一个合格标准,比如对于```CartPole v0```如果算法能够将奖励收敛到195以上,说明该算法合格。但实际上```CartPole v0```的每回合最大步数(max_episode_steps)是200,每步的奖励最大是1,也就是每回合最大奖励是200,比Gym设置的奖励阈值高。笔者猜测这可能是Gym给算法学习者们设置的一个参考线,而实际中在写算法时并不会用到这个奖励阈值,所以可以忽略。 + +再看每回合最大步数,可以看到```CartPole v1```的步数更长,相应的奖励要求更高,可以理解为```v1```是```v0```的难度升级版。 + + ### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) 注:gym 0.18.0之后版本中Pendulum-v0已经改为Pendulum-v1 @@ -31,4 +43,8 @@ image-20201007211858925 -由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。 \ No newline at end of file +由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。 + +## 参考 + +[Gym环境相关源码](https://github.com/openai/gym/tree/master/gym/envs) \ No newline at end of file